diff --git a/.config/.last_opt_in_prompt.yaml b/.config/.last_opt_in_prompt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/.config/.last_opt_in_prompt.yaml @@ -0,0 +1 @@ +{} diff --git a/.config/.last_survey_prompt.yaml b/.config/.last_survey_prompt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e96972851c5815d40b315aa06c738d2f83ac7a6e --- /dev/null +++ b/.config/.last_survey_prompt.yaml @@ -0,0 +1 @@ +last_prompt_time: 1752154462.9134393 diff --git a/.config/.last_update_check.json b/.config/.last_update_check.json new file mode 100644 index 0000000000000000000000000000000000000000..9884fd0205f6e2fd85b109120405ffa5dcaf438e --- /dev/null +++ b/.config/.last_update_check.json @@ -0,0 +1 @@ +{"last_update_check_time": 1752154468.6583943, "last_update_check_revision": 20250627154417, "notifications": [], "last_nag_times": {}} \ No newline at end of file diff --git a/.config/active_config b/.config/active_config new file mode 100644 index 0000000000000000000000000000000000000000..331d858ce9b12fa6720414196a9dd6e0b6a0faaa --- /dev/null +++ b/.config/active_config @@ -0,0 +1 @@ +default \ No newline at end of file diff --git a/.config/config_sentinel b/.config/config_sentinel new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.config/configurations/config_default b/.config/configurations/config_default new file mode 100644 index 0000000000000000000000000000000000000000..ee06685b6841afd85a59e8ea5bc7ee8a27d6fe74 --- /dev/null +++ b/.config/configurations/config_default @@ -0,0 +1,6 @@ +[component_manager] +disable_update_check = true + +[compute] +gce_metadata_read_timeout_sec = 0 + diff --git a/.config/default_configs.db b/.config/default_configs.db new file mode 100644 index 0000000000000000000000000000000000000000..e8a2c56e9e0369b0e66531a0ddfec7c2b10a73ee Binary files /dev/null and b/.config/default_configs.db differ diff --git a/.config/gce b/.config/gce new file mode 100644 index 0000000000000000000000000000000000000000..c1f22fbc23bb6ee67824843d6685826db10313d3 --- /dev/null +++ b/.config/gce @@ -0,0 +1 @@ +False \ No newline at end of file diff --git a/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db b/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db new file mode 100644 index 0000000000000000000000000000000000000000..285224b1b8eac03cfb63b9479efca1f68b6b7db5 Binary files /dev/null and b/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db differ diff --git a/.config/logs/2025.07.10/13.33.45.486303.log b/.config/logs/2025.07.10/13.33.45.486303.log new file mode 100644 index 0000000000000000000000000000000000000000..f2e6c1e2b1b312fdf2d8902ebbf98680949df159 --- /dev/null +++ b/.config/logs/2025.07.10/13.33.45.486303.log @@ -0,0 +1,765 @@ +2025-07-10 13:33:57,507 DEBUG root Loaded Command Group: ['gcloud', 'components'] +2025-07-10 13:33:57,511 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update'] +2025-07-10 13:33:57,513 DEBUG root Running [gcloud.components.update] with arguments: [--compile-python: "True", --quiet: "True", COMPONENT-IDS:6: "['core', 'gcloud-deps', 'bq', 'gcloud', 'gcloud-crc32c', 'gsutil']"] +2025-07-10 13:33:57,514 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes. 
+ +2025-07-10 13:33:57,545 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:33:57,735 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/11" 200 239798 +2025-07-10 13:33:57,747 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,748 INFO ___FILE_ONLY___ +Your current Google Cloud CLI version is: 529.0.0 + +2025-07-10 13:33:57,748 INFO ___FILE_ONLY___ Installing components from version: 529.0.0 + +2025-07-10 13:33:57,748 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,748 DEBUG root Chosen display Format:table[box,title="These components will be removed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:33:57,748 DEBUG root Chosen display Format:table[box,title="These components will be updated."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:33:57,749 DEBUG root Chosen display Format:table[box,title="These components will be installed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:33:57,786 INFO ___FILE_ONLY___ ┌─────────────────────────────────────────────────────────────────────────────┐ +2025-07-10 13:33:57,786 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ These components will be installed. │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ ├─────────────────────────────────────────────────────┬────────────┬──────────┤ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ Name │ Version │ Size │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ ├─────────────────────────────────────────────────────┼────────────┼──────────┤ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ BigQuery Command Line Tool +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ 2.1.19 +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ 1.8 MiB +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,787 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ BigQuery Command Line Tool (Platform Specific) +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ 2.1.17 +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ Bundled Python 3.12 (Platform Specific) +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ 3.12.9 +2025-07-10 
13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ 89.3 MiB +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,788 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ Cloud Storage Command Line Tool +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ 5.35 +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ 12.4 MiB +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ Cloud Storage Command Line Tool (Platform Specific) +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ 5.34 +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:33:57,789 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ Google Cloud CLI Core Libraries (Platform Specific) +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ 2025.05.23 +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ Google Cloud CRC32C Hash Tool (Platform Specific) +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ 1.0.0 +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,790 INFO ___FILE_ONLY___ 1.5 MiB +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ gcloud cli dependencies (Platform Specific) +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ 2021.04.16 +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ │ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ └─────────────────────────────────────────────────────┴────────────┴──────────┘ +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,791 INFO ___FILE_ONLY___ + +2025-07-10 13:33:57,795 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:33:58,889 DEBUG urllib3.connectionpool 
https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/11" 200 1444035 +2025-07-10 13:33:59,394 INFO ___FILE_ONLY___ For the latest full release notes, please visit: + https://cloud.google.com/sdk/release_notes + + +2025-07-10 13:33:59,395 INFO ___FILE_ONLY___ Performing in place update... + + +2025-07-10 13:33:59,397 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:33:59,397 INFO ___FILE_ONLY___ ╠═ Downloading: BigQuery Command Line Tool ═╣ + +2025-07-10 13:33:59,397 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:33:59,400 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:00,464 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-20250627154417.tar.gz HTTP/11" 200 1850167 +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,477 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,478 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,479 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,480 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,481 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ 
+2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:00,482 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:00,484 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:00,485 INFO ___FILE_ONLY___ ╠═ Downloading: BigQuery Command Line Tool (Platform Spe... ═╣ + +2025-07-10 13:34:00,485 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:00,489 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:01,535 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bq-nix-20250523104322.tar.gz HTTP/11" 200 1935 +2025-07-10 13:34:01,536 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:01,536 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:01,538 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:01,539 INFO ___FILE_ONLY___ ╠═ Downloading: Bundled Python 3.12 ═╣ + +2025-07-10 13:34:01,539 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:01,539 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:01,539 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:01,540 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:01,541 INFO ___FILE_ONLY___ ╠═ Downloading: Bundled Python 3.12 (Platform Specific) ═╣ + +2025-07-10 13:34:01,541 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:01,544 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:02,655 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-bundled-python3-unix-linux-x86_64-20250502143716.tar.gz HTTP/11" 200 93610468 +2025-07-10 13:34:03,004 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,006 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,008 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,010 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,012 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,014 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,016 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,018 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,020 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,022 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,024 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,025 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,027 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,029 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,031 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,033 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,035 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,037 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,039 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,041 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,043 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,045 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,047 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,049 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,051 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,053 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,054 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,056 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,058 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,060 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,062 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,064 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,066 INFO ___FILE_ONLY___ ═ +2025-07-10 
13:34:03,068 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,070 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,072 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,074 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,076 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,077 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,079 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,081 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,083 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,085 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,087 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,089 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,091 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,093 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,095 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,097 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,099 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,101 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,103 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,105 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,107 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,109 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,111 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,113 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,115 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,117 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,118 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:03,119 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:03,121 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:03,121 INFO ___FILE_ONLY___ ╠═ Downloading: Cloud Storage Command Line Tool ═╣ + +2025-07-10 13:34:03,121 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:03,124 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:04,188 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-20250627154417.tar.gz HTTP/11" 200 12962791 +2025-07-10 13:34:04,237 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,237 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,238 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,238 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,238 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,239 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,239 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,239 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,239 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,240 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,240 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,240 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,241 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,241 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,241 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,242 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,242 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,242 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,242 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,243 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,243 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,243 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,244 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,244 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,244 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,245 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,245 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,245 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,246 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,246 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,246 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,247 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,247 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,247 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,248 INFO 
___FILE_ONLY___ ═ +2025-07-10 13:34:04,248 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,248 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,249 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,249 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,249 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,250 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,250 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,250 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,250 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,251 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,251 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,252 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,252 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,252 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,253 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,253 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,253 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,254 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,254 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,254 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,255 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,255 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,255 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,255 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,256 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:04,256 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:04,258 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:04,258 INFO ___FILE_ONLY___ ╠═ Downloading: Cloud Storage Command Line Tool (Platfor... ═╣ + +2025-07-10 13:34:04,258 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:04,261 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:05,378 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gsutil-nix-20250523104322.tar.gz HTTP/11" 200 1950 +2025-07-10 13:34:05,378 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:05,378 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:05,380 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:05,380 INFO ___FILE_ONLY___ ╠═ Downloading: Default set of gcloud commands ═╣ + +2025-07-10 13:34:05,380 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:05,381 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:05,381 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:05,382 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:05,382 INFO ___FILE_ONLY___ ╠═ Downloading: Google Cloud CLI Core Libraries (Platfor... 
═╣ + +2025-07-10 13:34:05,382 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:05,386 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:05,571 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-core-nix-20250523104322.tar.gz HTTP/11" 200 2325 +2025-07-10 13:34:05,571 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:05,571 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:05,573 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:05,574 INFO ___FILE_ONLY___ ╠═ Downloading: Google Cloud CRC32C Hash Tool ═╣ + +2025-07-10 13:34:05,574 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:05,574 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:05,574 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:05,575 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:05,576 INFO ___FILE_ONLY___ ╠═ Downloading: Google Cloud CRC32C Hash Tool (Platform ... ═╣ + +2025-07-10 13:34:05,576 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:05,579 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:05,772 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-crc32c-linux-x86_64-20250613150750.tar.gz HTTP/11" 200 1525557 +2025-07-10 13:34:05,783 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,784 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,785 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,786 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO 
___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,787 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,788 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,789 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,789 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,789 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:05,789 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:05,791 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:05,791 INFO ___FILE_ONLY___ ╠═ Downloading: gcloud cli dependencies (Platform Specific) ═╣ + +2025-07-10 13:34:05,791 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:05,794 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:06,857 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-gcloud-deps-linux-x86_64-20210416153011.tar.gz HTTP/11" 200 104 +2025-07-10 13:34:06,857 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:06,857 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:06,860 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:06,860 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool ═╣ + +2025-07-10 13:34:06,860 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:06,972 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,975 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,977 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,979 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,982 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,984 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,987 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,989 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,991 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,993 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,996 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:06,998 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,000 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,003 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,005 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,007 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,010 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,012 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,014 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,017 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,020 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,022 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,024 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,027 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,028 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,030 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,033 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,035 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,037 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,041 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,043 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,046 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,047 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,051 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,054 INFO ___FILE_ONLY___ ═ 
+2025-07-10 13:34:07,061 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,066 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,071 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,073 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,075 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,078 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,080 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,083 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,087 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,089 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,091 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,093 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,095 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,097 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,100 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,102 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,104 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,106 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,109 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,111 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,113 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,115 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,117 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,119 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,121 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:07,121 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:07,129 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:07,129 INFO ___FILE_ONLY___ ╠═ Installing: BigQuery Command Line Tool (Platform Spec... ═╣ + +2025-07-10 13:34:07,129 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:07,130 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:07,130 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:07,136 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:07,136 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.12 ═╣ + +2025-07-10 13:34:07,136 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:07,138 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:07,138 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:07,140 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:07,140 INFO ___FILE_ONLY___ ╠═ Installing: Bundled Python 3.12 (Platform Specific) ═╣ + +2025-07-10 13:34:07,140 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:09,407 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,420 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,432 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,446 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,459 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,472 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,486 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,499 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,511 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,524 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,537 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,550 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,562 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,575 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,589 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,602 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,615 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,628 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,642 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,655 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,669 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,686 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,701 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,717 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,733 INFO ___FILE_ONLY___ ═ 
+2025-07-10 13:34:09,747 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,762 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,778 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,794 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,809 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,823 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,837 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,851 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,866 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,884 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:09,901 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:10,813 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:10,840 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,365 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,385 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,408 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,436 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,456 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,485 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,508 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,528 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,547 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,566 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,660 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,680 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,825 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,843 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,861 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,879 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,896 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,917 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,938 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,958 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:11,990 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:12,496 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:12,496 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:12,564 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:12,564 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool ═╣ + +2025-07-10 13:34:12,564 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:13,143 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,156 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,168 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,178 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,189 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,225 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,239 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,259 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,275 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,291 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,309 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,322 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,334 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,344 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,354 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,372 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,386 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,396 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,408 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,420 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,430 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,443 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,454 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,466 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,476 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,487 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,498 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,512 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,529 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,542 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,554 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,576 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,593 INFO 
___FILE_ONLY___ ═ +2025-07-10 13:34:13,615 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,632 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,652 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,664 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,677 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,717 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,734 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,746 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,757 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,768 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,781 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,795 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,807 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,819 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,831 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,843 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,858 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,875 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,889 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,904 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,919 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,932 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,948 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,965 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:13,987 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:14,005 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:14,024 INFO ___FILE_ONLY___ ═ +2025-07-10 13:34:14,024 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,056 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,056 INFO ___FILE_ONLY___ ╠═ Installing: Cloud Storage Command Line Tool (Platform... ═╣ + +2025-07-10 13:34:14,056 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,057 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:14,057 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,061 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,061 INFO ___FILE_ONLY___ ╠═ Installing: Default set of gcloud commands ═╣ + +2025-07-10 13:34:14,061 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,064 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:14,064 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,066 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,066 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CLI Core Libraries (Platform... ═╣ + +2025-07-10 13:34:14,066 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,067 INFO ___FILE_ONLY___ ══════════════════════════════ +2025-07-10 13:34:14,067 INFO ___FILE_ONLY___ ══════════════════════════════ +2025-07-10 13:34:14,067 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,071 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,071 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool ═╣ + +2025-07-10 13:34:14,071 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,074 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:14,074 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,075 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,075 INFO ___FILE_ONLY___ ╠═ Installing: Google Cloud CRC32C Hash Tool (Platform S... 
═╣ + +2025-07-10 13:34:14,076 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,115 INFO ___FILE_ONLY___ ══════════════════════════════ +2025-07-10 13:34:14,116 INFO ___FILE_ONLY___ ══════════════════════════════ +2025-07-10 13:34:14,116 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,121 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:14,121 INFO ___FILE_ONLY___ ╠═ Installing: gcloud cli dependencies (Platform Specific) ═╣ + +2025-07-10 13:34:14,121 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:14,121 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:14,121 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:14,126 DEBUG root Updating notification cache... +2025-07-10 13:34:14,127 INFO ___FILE_ONLY___ + +2025-07-10 13:34:14,128 INFO ___FILE_ONLY___ Performing post processing steps... +2025-07-10 13:34:14,129 DEBUG root Executing command: ['/tools/google-cloud-sdk/bin/gcloud', 'components', 'post-process'] +2025-07-10 13:34:22,739 DEBUG ___FILE_ONLY___ +2025-07-10 13:34:22,739 DEBUG ___FILE_ONLY___ +2025-07-10 13:34:22,909 INFO root descriptor_list: [{'universeDomain': 'googleapis.com', 'universeShortName': '', 'authenticationDomain': 'auth.cloud.google.com', 'projectPrefix': '', 'cloudWebDomain': 'cloud.google.com', 'documentationDomain': 'cloud.google.com', 'version': '1.0.0', 'state': 'primary', 'artifactRegistryDomain': 'pkg.dev'}] +2025-07-10 13:34:22,909 INFO ___FILE_ONLY___ +Update done! + + +2025-07-10 13:34:22,912 DEBUG root Chosen display Format:none +2025-07-10 13:34:22,912 INFO root Display format: "none" diff --git a/.config/logs/2025.07.10/13.34.14.671755.log b/.config/logs/2025.07.10/13.34.14.671755.log new file mode 100644 index 0000000000000000000000000000000000000000..ba7d8f75b4157e82bd9f221ff9f2317b4dc06968 --- /dev/null +++ b/.config/logs/2025.07.10/13.34.14.671755.log @@ -0,0 +1,5 @@ +2025-07-10 13:34:14,672 DEBUG root Loaded Command Group: ['gcloud', 'components'] +2025-07-10 13:34:14,674 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process'] +2025-07-10 13:34:14,676 DEBUG root Running [gcloud.components.post-process] with arguments: [] +2025-07-10 13:34:22,616 DEBUG root Chosen display Format:none +2025-07-10 13:34:22,616 INFO root Display format: "none" diff --git a/.config/logs/2025.07.10/13.34.23.548658.log b/.config/logs/2025.07.10/13.34.23.548658.log new file mode 100644 index 0000000000000000000000000000000000000000..bd929b93a82c1ab70fb25ad3ecca65aed6c9ffe3 --- /dev/null +++ b/.config/logs/2025.07.10/13.34.23.548658.log @@ -0,0 +1,153 @@ +2025-07-10 13:34:23,549 DEBUG root Loaded Command Group: ['gcloud', 'components'] +2025-07-10 13:34:23,551 DEBUG root Loaded Command Group: ['gcloud', 'components', 'update'] +2025-07-10 13:34:23,553 DEBUG root Running [gcloud.components.update] with arguments: [--quiet: "True", COMPONENT-IDS:8: "['gcloud', 'core', 'bq', 'gsutil', 'compute', 'preview', 'alpha', 'beta']"] +2025-07-10 13:34:23,554 INFO ___FILE_ONLY___ Beginning update. This process may take several minutes. + +2025-07-10 13:34:23,562 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:23,747 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components-2.json HTTP/11" 200 239798 +2025-07-10 13:34:23,761 WARNING root Component [compute] no longer exists. 
+2025-07-10 13:34:23,762 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,762 INFO ___FILE_ONLY___ +Your current Google Cloud CLI version is: 529.0.0 + +2025-07-10 13:34:23,762 INFO ___FILE_ONLY___ Installing components from version: 529.0.0 + +2025-07-10 13:34:23,762 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,763 DEBUG root Chosen display Format:table[box,title="These components will be removed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:34:23,763 DEBUG root Chosen display Format:table[box,title="These components will be updated."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:34:23,763 DEBUG root Chosen display Format:table[box,title="These components will be installed."](details.display_name:label=Name:align=left,version.version_string:label=Version:align=right,data.size.size(zero="",min=1048576):label=Size:align=right) +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ ┌────────────────────────────────────────────────┐ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ │ These components will be installed. │ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ ├─────────────────────────┬────────────┬─────────┤ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ │ Name │ Version │ Size │ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ ├─────────────────────────┼────────────┼─────────┤ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ gcloud Alpha Commands +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ 2025.06.27 +2025-07-10 13:34:23,778 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ gcloud Beta Commands +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ 2025.06.27 +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,779 INFO ___FILE_ONLY___ gcloud Preview Commands +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ < 1 MiB +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ │ +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ └─────────────────────────┴────────────┴─────────┘ +2025-07-10 13:34:23,780 INFO 
___FILE_ONLY___ + +2025-07-10 13:34:23,780 INFO ___FILE_ONLY___ + +2025-07-10 13:34:23,783 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:24,873 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/RELEASE_NOTES HTTP/11" 200 1444035 +2025-07-10 13:34:25,334 INFO ___FILE_ONLY___ For the latest full release notes, please visit: + https://cloud.google.com/sdk/release_notes + + +2025-07-10 13:34:25,335 INFO ___FILE_ONLY___ Performing in place update... + + +2025-07-10 13:34:25,337 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:25,337 INFO ___FILE_ONLY___ ╠═ Downloading: gcloud Alpha Commands ═╣ + +2025-07-10 13:34:25,337 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:25,340 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:26,465 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-alpha-20250627154417.tar.gz HTTP/11" 200 800 +2025-07-10 13:34:26,465 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:26,465 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:26,467 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:26,467 INFO ___FILE_ONLY___ ╠═ Downloading: gcloud Beta Commands ═╣ + +2025-07-10 13:34:26,468 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:26,471 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:27,534 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-beta-20250627154417.tar.gz HTTP/11" 200 797 +2025-07-10 13:34:27,535 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:27,535 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:27,537 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:27,537 INFO ___FILE_ONLY___ ╠═ Downloading: gcloud Preview Commands ═╣ + +2025-07-10 13:34:27,537 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:27,540 DEBUG urllib3.connectionpool Starting new HTTPS connection (1): dl.google.com:443 +2025-07-10 13:34:28,637 DEBUG urllib3.connectionpool https://dl.google.com:443 "GET /dl/cloudsdk/channels/rapid/components/google-cloud-sdk-preview-20241115154308.tar.gz HTTP/11" 200 823 +2025-07-10 13:34:28,638 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:28,638 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:28,640 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:28,640 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Alpha Commands ═╣ + +2025-07-10 13:34:28,640 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:28,641 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:28,641 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:28,647 INFO ___FILE_ONLY___ ╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:28,647 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Beta Commands ═╣ + +2025-07-10 13:34:28,647 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:28,648 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:28,648 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:28,652 INFO ___FILE_ONLY___ 
╔════════════════════════════════════════════════════════════╗ + +2025-07-10 13:34:28,652 INFO ___FILE_ONLY___ ╠═ Installing: gcloud Preview Commands ═╣ + +2025-07-10 13:34:28,652 INFO ___FILE_ONLY___ ╚ +2025-07-10 13:34:28,653 INFO ___FILE_ONLY___ ════════════════════════════════════════════════════════════ +2025-07-10 13:34:28,653 INFO ___FILE_ONLY___ ╝ + +2025-07-10 13:34:28,658 DEBUG root Updating notification cache... +2025-07-10 13:34:28,658 INFO ___FILE_ONLY___ + +2025-07-10 13:34:28,660 INFO ___FILE_ONLY___ Performing post processing steps... +2025-07-10 13:34:28,660 DEBUG root Executing command: ['/tools/google-cloud-sdk/bin/gcloud', 'components', 'post-process'] +2025-07-10 13:34:37,066 DEBUG ___FILE_ONLY___ +2025-07-10 13:34:37,067 DEBUG ___FILE_ONLY___ +2025-07-10 13:34:37,191 INFO root descriptor_list: [{'universeDomain': 'googleapis.com', 'universeShortName': '', 'authenticationDomain': 'auth.cloud.google.com', 'projectPrefix': '', 'cloudWebDomain': 'cloud.google.com', 'documentationDomain': 'cloud.google.com', 'version': '1.0.0', 'state': 'primary', 'artifactRegistryDomain': 'pkg.dev'}] +2025-07-10 13:34:37,191 INFO ___FILE_ONLY___ +Update done! + + +2025-07-10 13:34:37,193 DEBUG root Chosen display Format:none +2025-07-10 13:34:37,194 INFO root Display format: "none" diff --git a/.config/logs/2025.07.10/13.34.29.179991.log b/.config/logs/2025.07.10/13.34.29.179991.log new file mode 100644 index 0000000000000000000000000000000000000000..3609adaa812d3ec59e75f06249bb0a952a368fbd --- /dev/null +++ b/.config/logs/2025.07.10/13.34.29.179991.log @@ -0,0 +1,5 @@ +2025-07-10 13:34:29,180 DEBUG root Loaded Command Group: ['gcloud', 'components'] +2025-07-10 13:34:29,182 DEBUG root Loaded Command Group: ['gcloud', 'components', 'post_process'] +2025-07-10 13:34:29,183 DEBUG root Running [gcloud.components.post-process] with arguments: [] +2025-07-10 13:34:36,952 DEBUG root Chosen display Format:none +2025-07-10 13:34:36,953 INFO root Display format: "none" diff --git a/.config/logs/2025.07.10/13.34.37.820504.log b/.config/logs/2025.07.10/13.34.37.820504.log new file mode 100644 index 0000000000000000000000000000000000000000..6f39606ab5880f54ad41912c38ef67a1d29ba311 --- /dev/null +++ b/.config/logs/2025.07.10/13.34.37.820504.log @@ -0,0 +1,8 @@ +2025-07-10 13:34:37,822 DEBUG root Loaded Command Group: ['gcloud', 'config'] +2025-07-10 13:34:37,872 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set'] +2025-07-10 13:34:37,875 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "component_manager/disable_update_check", VALUE: "true"] +2025-07-10 13:34:37,876 INFO ___FILE_ONLY___ Updated property [component_manager/disable_update_check]. + +2025-07-10 13:34:37,876 DEBUG root Chosen display Format:default +2025-07-10 13:34:37,877 INFO root Display format: "default" +2025-07-10 13:34:37,877 DEBUG root SDK update checks are disabled. 
diff --git a/.config/logs/2025.07.10/13.34.38.537881.log b/.config/logs/2025.07.10/13.34.38.537881.log new file mode 100644 index 0000000000000000000000000000000000000000..c7acac2873212078fc0c21d8769e3f735dd69a20 --- /dev/null +++ b/.config/logs/2025.07.10/13.34.38.537881.log @@ -0,0 +1,8 @@ +2025-07-10 13:34:38,539 DEBUG root Loaded Command Group: ['gcloud', 'config'] +2025-07-10 13:34:38,585 DEBUG root Loaded Command Group: ['gcloud', 'config', 'set'] +2025-07-10 13:34:38,588 DEBUG root Running [gcloud.config.set] with arguments: [SECTION/PROPERTY: "compute/gce_metadata_read_timeout_sec", VALUE: "0"] +2025-07-10 13:34:38,588 INFO ___FILE_ONLY___ Updated property [compute/gce_metadata_read_timeout_sec]. + +2025-07-10 13:34:38,589 DEBUG root Chosen display Format:default +2025-07-10 13:34:38,590 INFO root Display format: "default" +2025-07-10 13:34:38,590 DEBUG root SDK update checks are disabled. diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..cdf4c901ce4c9fe65d53471ead31a00f54dbed33 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b filter=lfs diff=lfs merge=lfs -text +huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer.json filter=lfs diff=lfs merge=lfs -text +chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text diff --git a/chroma_db/chroma.sqlite3 b/chroma_db/chroma.sqlite3 new file mode 100644 index 0000000000000000000000000000000000000000..38b7742b96c7d2f6272c3418076e19bfb29d4bb3 --- /dev/null +++ b/chroma_db/chroma.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e877e959dc5e3a3655d9c163ffc335a3bf0ee5c284e175908884b56de795094f +size 52137984 diff --git a/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/data_level0.bin b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/data_level0.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6b4d759c84516f91c516a3ec7b0bbcf9247ada0 --- /dev/null +++ b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/data_level0.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:673b2e64e86d2b7f49b2013fb7295df8d99487851daf1bd66e6f8926505ce4ca +size 62840000 diff --git a/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/header.bin b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/header.bin new file mode 100644 index 0000000000000000000000000000000000000000..a150f7e417c6d622d026ba53c1ade741e89166b5 --- /dev/null +++ b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/header.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013f19a73d5a9d9665fde305a5998ed6dfc63d6321acd0a84d63d36b0574fc1a +size 100 diff --git a/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/index_metadata.pickle b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/index_metadata.pickle new file mode 100644 index 0000000000000000000000000000000000000000..24fae80388e158d4a9bb0a5bcf7fde6a1021d3e8 --- /dev/null +++ 
b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/index_metadata.pickle @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf15885b30a230a1f920f5769d88c61b196904264f336c05c40e33066385a9f9 +size 277704 diff --git a/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/length.bin b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/length.bin new file mode 100644 index 0000000000000000000000000000000000000000..57a0bb4889b442e48281d402b53360287b516c79 --- /dev/null +++ b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/length.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3a9b6ddccef1879c12ad9fa60332b1854c21afc84391da9dd9dce6e50489857 +size 40000 diff --git a/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/link_lists.bin b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/link_lists.bin new file mode 100644 index 0000000000000000000000000000000000000000..952d00ff5d44c26c7b82c3687f9df1bb23f626c4 --- /dev/null +++ b/chroma_db/eecd79f6-a7d8-4f49-af31-af17bc062218/link_lists.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5f297a1102825b526f19648314675dd2f8b88595126380651a22fcc73c6b2f +size 27028 diff --git a/docstore/0067c469-f97a-4b83-8e90-f8f29ec88b9c b/docstore/0067c469-f97a-4b83-8e90-f8f29ec88b9c new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/0067c469-f97a-4b83-8e90-f8f29ec88b9c @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
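For the note above about thinking being on by default in the 2.5 models, here is a minimal sketch using the google-genai Python SDK (the same client pattern used in the samples on this page) of how a thinking budget might be configured. The prompt string is purely illustrative, and, per the thinkingConfig values shown later on this page, a budget of 0 turns thinking off while -1 requests dynamic thinking.

from google import genai
from google.genai import types

client = genai.Client()  # reads the API key (e.g. GEMINI_API_KEY) from the environment

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the trade-offs between the 2.5 Pro and 2.5 Flash variants.",  # illustrative prompt
    config=types.GenerateContentConfig(
        # 0 disables thinking; -1 enables dynamic thinking (model decides how much to think)
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)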
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/006c58ea-a83a-4cb5-8316-31c02cc01c70 b/docstore/006c58ea-a83a-4cb5-8316-31c02cc01c70 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/006c58ea-a83a-4cb5-8316-31c02cc01c70 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/009bc6d7-eaa6-4ead-a174-e5d6f058dfea b/docstore/009bc6d7-eaa6-4ead-a174-e5d6f058dfea new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/009bc6d7-eaa6-4ead-a174-e5d6f058dfea @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . 
get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/00a07419-5f3b-46c3-9e8a-6541d35d620e b/docstore/00a07419-5f3b-46c3-9e8a-6541d35d620e new file mode 100644 index 0000000000000000000000000000000000000000..2ea000ef37ce0b12cd8c95d1c4d56237f0f71002 --- /dev/null +++ b/docstore/00a07419-5f3b-46c3-9e8a-6541d35d620e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/billing Title: Billing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/00b35170-69f6-403d-ad95-2ca64c3399f5 b/docstore/00b35170-69f6-403d-ad95-2ca64c3399f5 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/00b35170-69f6-403d-ad95-2ca64c3399f5 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. 
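A minimal sketch of generating an embedding with the text-embedding-004 model via the google-genai Python SDK's embed_content method; the sample string is illustrative, and the dimension check reflects the 768-dimension output size listed for this model.

from google import genai

client = genai.Client()  # reads the API key from the environment

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",  # illustrative input text
)
# Each input produces one embedding; text-embedding-004 returns 768-dimensional vectors.
print(len(result.embeddings[0].values))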
Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/00ceea18-bfce-4340-a019-233a30ab30f7 b/docstore/00ceea18-bfce-4340-a019-233a30ab30f7 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/00ceea18-bfce-4340-a019-233a30ab30f7 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. 
Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/0139abe7-0f1b-443f-b7a1-5f425686a7b8 b/docstore/0139abe7-0f1b-443f-b7a1-5f425686a7b8 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/0139abe7-0f1b-443f-b7a1-5f425686a7b8 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/014320e7-ad22-4ecf-a6ed-2f461a6c1705 b/docstore/014320e7-ad22-4ecf-a6ed-2f461a6c1705 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/014320e7-ad22-4ecf-a6ed-2f461a6c1705 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . 
size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/014d1118-fffc-4f45-bacf-d5ff55838bf4 b/docstore/014d1118-fffc-4f45-bacf-d5ff55838bf4 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/014d1118-fffc-4f45-bacf-d5ff55838bf4 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/01524be3-5603-4c0f-8afe-7a2590e3f4a2 b/docstore/01524be3-5603-4c0f-8afe-7a2590e3f4a2 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/01524be3-5603-4c0f-8afe-7a2590e3f4a2 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/015f8fee-f8f3-4ad4-bfce-d523433342e7 b/docstore/015f8fee-f8f3-4ad4-bfce-d523433342e7 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/015f8fee-f8f3-4ad4-bfce-d523433342e7 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
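For the Imagen image-generation models described above, a minimal sketch assuming the google-genai Python SDK's generate_images method; the prompt and output filename are illustrative, and imagen-3.0-generate-002 is the model code listed in the Imagen 3 details.

from google import genai
from google.genai import types

client = genai.Client()  # reads the API key from the environment

result = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A lighthouse on a rocky coast at dusk",  # illustrative prompt
    config=types.GenerateImagesConfig(number_of_images=1),
)
# Save the first generated image to disk (filename is illustrative).
with open("lighthouse.png", "wb") as f:
    f.write(result.generated_images[0].image.image_bytes)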
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/018cc2a1-fad6-4a7c-b90b-193c0662abcc b/docstore/018cc2a1-fad6-4a7c-b90b-193c0662abcc new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/018cc2a1-fad6-4a7c-b90b-193c0662abcc @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/019f16a2-c460-46a5-834e-e2cfd9943d27 b/docstore/019f16a2-c460-46a5-834e-e2cfd9943d27 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/019f16a2-c460-46a5-834e-e2cfd9943d27 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/01a3d852-18c4-4010-be44-de84fc65a7f8 b/docstore/01a3d852-18c4-4010-be44-de84fc65a7f8 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/01a3d852-18c4-4010-be44-de84fc65a7f8 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/01b07962-66fe-4114-a824-4e927fbdf77f b/docstore/01b07962-66fe-4114-a824-4e927fbdf77f new file mode 100644 index 0000000000000000000000000000000000000000..150f8758ce4500c63fdc2d62f5bb812ca3b2d976 --- /dev/null +++ b/docstore/01b07962-66fe-4114-a824-4e927fbdf77f @@ -0,0 +1 @@ +client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . 
readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/01bd8c4d-19d8-43c2-90ff-0b99912b7b16 b/docstore/01bd8c4d-19d8-43c2-90ff-0b99912b7b16 new file mode 100644 index 0000000000000000000000000000000000000000..b99824a0bb181cb1be6367ec11bfeefdd4ec4b3d --- /dev/null +++ b/docstore/01bd8c4d-19d8-43c2-90ff-0b99912b7b16 @@ -0,0 +1 @@ +AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass audio data inline Instead of uploading an audio file, you can pass inline audio data in the request to generateContent : Python from google.genai import types with open ( 'path/to/small-sample.mp3' , 'rb' ) as f : audio_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ 'Describe this audio clip' , types . Part . from_bytes ( data = audio_bytes , mime_type = 'audio/mp3' , ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64AudioFile = fs . readFileSync ( "path/to/small-sample.mp3" , { encoding : "base64" , }); const contents = [ { text : "Please summarize the audio." }, { inlineData : { mimeType : "audio/mp3" , data : base64AudioFile , }, }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } audioBytes , _ := os . ReadFile ( "/path/to/small-sample.mp3" ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), & genai . \ No newline at end of file diff --git a/docstore/01bf8b4d-c2e1-4afb-84ce-f3965f9c0d14 b/docstore/01bf8b4d-c2e1-4afb-84ce-f3965f9c0d14 new file mode 100644 index 0000000000000000000000000000000000000000..fcd49fd9d1e1bfa6316de012e7df2b50b5ffda8c --- /dev/null +++ b/docstore/01bf8b4d-c2e1-4afb-84ce-f3965f9c0d14 @@ -0,0 +1 @@ +from_cached_content ( cached_content = apollo_cache ) response = apollo_model . 
generate_content ( "Find a lighthearted moment from this transcript" ) JavaScript import { GoogleAICacheManager , GoogleAIFileManager } from "@google/generative-ai/server" ; import { GoogleGenerativeAI } from "@google/generative-ai" ; const cacheManager = new GoogleAICacheManager ( "GOOGLE_API_KEY" ); const fileManager = new GoogleAIFileManager ( "GOOGLE_API_KEY" ); const uploadResult = await fileManager . uploadFile ( "path/to/a11.txt" , { mimeType : "text/plain" , }); const cacheResult = await cacheManager . create ({ model : "models/gemini-1.5-flash" , contents : [ { role : "user" , parts : [ { fileData : { fileUri : uploadResult . file . uri , mimeType : uploadResult . file . mimeType , }, }, ], }, ], }); console . log ( cacheResult ); const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModelFromCachedContent ( cacheResult ); const result = await model . generateContent ( "Please summarize this transcript." , ); console . log ( result . response . text ()); After Python import requests import pathlib from google import genai from google.genai import types client = genai . Client () # Check which models support caching. for m in client . models . list (): for action in m . supported_actions : if action == "createCachedContent" : print ( m . name ) break # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = client . files . upload ( file = 'a11.txt' ) # Create cache model = 'gemini-1.5-flash-001' apollo_cache = client . caches . create ( model = model , config = { 'contents' : [ document ], 'system_instruction' : 'You are an expert at analyzing transcripts.' , }, ) # Generate response response = client . models . generate_content ( model = model , contents = 'Find a lighthearted moment from this \ No newline at end of file diff --git a/docstore/01d25c80-25ba-4148-9cb6-0c19a722a776 b/docstore/01d25c80-25ba-4148-9cb6-0c19a722a776 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/01d25c80-25ba-4148-9cb6-0c19a722a776 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/0201f6ab-5cd3-47a2-88b2-da4da07a432d b/docstore/0201f6ab-5cd3-47a2-88b2-da4da07a432d new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/0201f6ab-5cd3-47a2-88b2-da4da07a432d @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. 
Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/0210f190-906a-462c-863b-f39ab493f00c b/docstore/0210f190-906a-462c-863b-f39ab493f00c new file mode 100644 index 0000000000000000000000000000000000000000..7ab62d419d5012f3f655310e2eb3eaad5959edc1 --- /dev/null +++ b/docstore/0210f190-906a-462c-863b-f39ab493f00c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-native-audio Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/021a0594-6ae7-4e26-bef3-5ac5494805b7 b/docstore/021a0594-6ae7-4e26-bef3-5ac5494805b7 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/021a0594-6ae7-4e26-bef3-5ac5494805b7 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. 
Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/021b3007-41c1-47b8-9b1f-63b9264653a2 b/docstore/021b3007-41c1-47b8-9b1f-63b9264653a2 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/021b3007-41c1-47b8-9b1f-63b9264653a2 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
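Pulling together the function-declaration fields described earlier (name, description, parameters, enum, required), here is a minimal sketch of passing such a declaration to generate_content with the google-genai Python SDK. The get_weather_forecast declaration is illustrative and mirrors the examples given in those field descriptions.

from google import genai
from google.genai import types

# Illustrative declaration: a weather lookup keyed by location, with an enum-constrained unit.
get_weather_forecast = {
    "name": "get_weather_forecast",
    "description": "Gets the weather forecast for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state, e.g. 'San Francisco, CA', or a zip code e.g. '95616'.",
            },
            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
        },
        "required": ["location"],
    },
}

client = genai.Client()  # reads the API key from the environment
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the weather like in Davis, CA this weekend?",  # illustrative prompt
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[get_weather_forecast])]
    ),
)
# If the model decides to call the declared function, the call shows up in the response parts.
print(response.candidates[0].content.parts[0].function_call)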
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/021bd55c-72f9-434d-9a17-e72a9244e94f b/docstore/021bd55c-72f9-434d-9a17-e72a9244e94f new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/021bd55c-72f9-434d-9a17-e72a9244e94f @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/022e94bc-3fb7-474c-aa2d-790462d62cb0 b/docstore/022e94bc-3fb7-474c-aa2d-790462d62cb0 new file mode 100644 index 0000000000000000000000000000000000000000..0ae258e3f4ca40ea8954afdd9da06087e388f8d9 --- /dev/null +++ b/docstore/022e94bc-3fb7-474c-aa2d-790462d62cb0 @@ -0,0 +1 @@ +generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) docUrl1 := "https://arxiv.org/pdf/2312.11805" docUrl2 := "https://arxiv.org/pdf/2403.05530" localPath1 := "doc1_downloaded.pdf" localPath2 := "doc2_downloaded.pdf" respHttp1 , _ := http . Get ( docUrl1 ) defer respHttp1 . Body . Close () outFile1 , _ := os . Create ( localPath1 ) _ , _ = io . Copy ( outFile1 , respHttp1 . Body ) outFile1 . 
Close () respHttp2 , _ := http . Get ( docUrl2 ) defer respHttp2 . Body . Close () outFile2 , _ := os . Create ( localPath2 ) _ , _ = io . Copy ( outFile2 , respHttp2 . Body ) outFile2 . Close () uploadConfig1 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile1 , _ := client . Files . UploadFromPath ( ctx , localPath1 , uploadConfig1 ) uploadConfig2 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile2 , _ := client . Files . UploadFromPath ( ctx , localPath2 , uploadConfig2 ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile1 . URI , uploadedFile1 . MIMEType ), genai . NewPartFromURI ( uploadedFile2 . URI , uploadedFile2 . MIMEType ), genai . NewPartFromText ( "What is the difference between each of the " + "main benchmarks between these two papers? " + "Output these in a table." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } modelName := "gemini-2.5-flash" result , _ := client . Models . GenerateContent ( ctx , modelName , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL_1 = "https://arxiv.org/pdf/2312.11805" DOC_URL_2 = "https://arxiv.org/pdf/2403.05530" DISPLAY_NAME_1 = "Gemini_paper" \ No newline at end of file diff --git a/docstore/025538f8-2596-4e02-acd4-6e6c4b7a7880 b/docstore/025538f8-2596-4e02-acd4-6e6c4b7a7880 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/025538f8-2596-4e02-acd4-6e6c4b7a7880 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/025947a1-f493-4e2c-b5c4-004b42a7d108 b/docstore/025947a1-f493-4e2c-b5c4-004b42a7d108 new file mode 100644 index 0000000000000000000000000000000000000000..f1e5636b87347e50cd07537e6c2f618b0c2ce995 --- /dev/null +++ b/docstore/025947a1-f493-4e2c-b5c4-004b42a7d108 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/02666ab0-b248-4ee0-b01b-cd4bf1f27ec6 b/docstore/02666ab0-b248-4ee0-b01b-cd4bf1f27ec6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/02666ab0-b248-4ee0-b01b-cd4bf1f27ec6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/026e3d2d-c1e8-4954-9946-86b8e7ea4710 b/docstore/026e3d2d-c1e8-4954-9946-86b8e7ea4710 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/026e3d2d-c1e8-4954-9946-86b8e7ea4710 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/02805def-4645-473d-855e-f70eb07baae4 b/docstore/02805def-4645-473d-855e-f70eb07baae4 new file mode 100644 index 0000000000000000000000000000000000000000..5b10a49a34afcc5006e0bf4f1bcb0c14355ae334 --- /dev/null +++ b/docstore/02805def-4645-473d-855e-f70eb07baae4 @@ -0,0 +1 @@ +environment includes the following libraries: attrs chess contourpy fpdf geopandas imageio jinja2 joblib jsonschema jsonschema-specifications lxml matplotlib mpmath numpy opencv-python openpyxl packaging pandas pillow protobuf pylatex pyparsing PyPDF2 python-dateutil python-docx python-pptx reportlab scikit-learn scipy seaborn six striprtf sympy tabulate tensorflow toolz xlrd You can't install your own libraries. Note: Only matplotlib is supported for graph rendering using code execution. What's next Try the code execution Colab . Learn about other Gemini API tools: Function calling Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/02a199a7-073a-4e41-8a26-7a31f4ebdb46 b/docstore/02a199a7-073a-4e41-8a26-7a31f4ebdb46 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/02a199a7-073a-4e41-8a26-7a31f4ebdb46 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/02a36934-4ec6-4346-aadb-f39bd6a6cf41 b/docstore/02a36934-4ec6-4346-aadb-f39bd6a6cf41 new file mode 100644 index 0000000000000000000000000000000000000000..40517314fd91c121847408df8a1f7fc600adf0b3 --- /dev/null +++ b/docstore/02a36934-4ec6-4346-aadb-f39bd6a6cf41 @@ -0,0 +1 @@ +string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property \ No newline at end of file diff --git a/docstore/02a8d338-caf7-4b03-85b8-a1fefe426879 b/docstore/02a8d338-caf7-4b03-85b8-a1fefe426879 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/02a8d338-caf7-4b03-85b8-a1fefe426879 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. 
[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/02ab5455-cf27-4a0f-b217-bd771fa65ac7 b/docstore/02ab5455-cf27-4a0f-b217-bd771fa65ac7 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/02ab5455-cf27-4a0f-b217-bd771fa65ac7 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
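Relating back to the model version name patterns described earlier, here is a minimal sketch (assuming the google-genai Python SDK used elsewhere in this document) that contrasts the latest stable alias with a pinned stable version; the model codes follow the patterns above.

Python
from google import genai

client = genai.Client()

# Latest stable alias: tracks the newest stable release of this variant.
latest_stable = "gemini-2.0-flash"
# Specific stable version: usually doesn't change; preferred for production apps.
pinned_stable = "gemini-2.0-flash-001"

for model_name in (latest_stable, pinned_stable):
    response = client.models.generate_content(
        model=model_name,
        contents="Reply with the single word: ok",
    )
    print(model_name, "->", response.text)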
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/02daecf4-f065-4fba-a62c-748db94bcb91 b/docstore/02daecf4-f065-4fba-a62c-748db94bcb91 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/02daecf4-f065-4fba-a62c-748db94bcb91 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/02dd9e11-6c77-43ad-9a1d-9b7a28651635 b/docstore/02dd9e11-6c77-43ad-9a1d-9b7a28651635 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/02dd9e11-6c77-43ad-9a1d-9b7a28651635 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/02eebb02-f52d-4c09-882c-9ef3cee8f577 b/docstore/02eebb02-f52d-4c09-882c-9ef3cee8f577 new file mode 100644 index 0000000000000000000000000000000000000000..5ea931970e1e44bb3498d17f9957ce8d114559a4 --- /dev/null +++ b/docstore/02eebb02-f52d-4c09-882c-9ef3cee8f577 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/files#troubleshooting Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/02f14f0c-83f0-4433-a600-7c0f3e618c48 b/docstore/02f14f0c-83f0-4433-a600-7c0f3e618c48 new file mode 100644 index 0000000000000000000000000000000000000000..4f5a135f81a8ec2b4f9be3f8bcfa685a50149381 --- /dev/null +++ b/docstore/02f14f0c-83f0-4433-a600-7c0f3e618c48 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Can you add a few more lines to this poem?"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri 
'}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json You can verify the API successfully stored the uploaded file and get its metadata by calling files.get . Only the name (and by extension, the uri ) are unique. Python from google import genai import pathlib client = genai . Client () fpath = pathlib . Path ( 'example.txt' ) fpath . write_text ( 'hello' ) file = client . files . upload ( file = 'example.txt' ) file_info = client . files . get ( name = file . name ) print ( file_info . model_dump_json ( indent = 4 )) REST name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri Passing multiple PDFs The Gemini API is capable of processing multiple PDF documents (up to 1000 pages) in a single request, as long as the combined size of the documents and the text prompt stays within the model's context window. Python from google import genai import io import httpx client = genai . Client () doc_url_1 = "https://arxiv.org/pdf/2312.11805" doc_url_2 = "https://arxiv.org/pdf/2403.05530" # Retrieve and upload both PDFs using the File API doc_data_1 = io . BytesIO ( httpx . get ( doc_url_1 ) . content ) doc_data_2 = io . BytesIO ( httpx . get ( doc_url_2 ) . content ) sample_pdf_1 = client . files . upload ( file = doc_data_1 , config = dict ( mime_type = 'application/pdf' ) ) sample_pdf_2 = client . files . \ No newline at end of file diff --git a/docstore/02f68230-8de9-485f-9551-fafec8ed9c8a b/docstore/02f68230-8de9-485f-9551-fafec8ed9c8a new file mode 100644 index 0000000000000000000000000000000000000000..150f8758ce4500c63fdc2d62f5bb812ca3b2d976 --- /dev/null +++ b/docstore/02f68230-8de9-485f-9551-fafec8ed9c8a @@ -0,0 +1 @@ +client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . 
readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/02f99502-a1f6-450c-a734-3e3c029d0780 b/docstore/02f99502-a1f6-450c-a734-3e3c029d0780 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/02f99502-a1f6-450c-a734-3e3c029d0780 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/0300c86f-a947-4a28-b0cf-f90d70d7eda8 b/docstore/0300c86f-a947-4a28-b0cf-f90d70d7eda8 new file mode 100644 index 0000000000000000000000000000000000000000..ebc8fdc5ad27fd96758924c177eadfccc4d6556f --- /dev/null +++ b/docstore/0300c86f-a947-4a28-b0cf-f90d70d7eda8 @@ -0,0 +1 @@ +Structured output | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Structured output You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes, standardize them to build a structured database. Gemini can generate either JSON or enum values as structured output. Generating JSON There are two ways to generate JSON using the Gemini API: Configure a schema on the model Provide a schema in a text prompt Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON. Configuring a schema (recommended) To constrain the model to generate JSON, configure a responseSchema . The model will then respond to any prompt with JSON-formatted output. Python from google import genai from pydantic import BaseModel class Recipe ( BaseModel ): recipe_name : str ingredients : list [ str ] client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "List a few popular cookie recipes, and include the amounts of ingredients." , config = { "response_mime_type" : "application/json" , "response_schema" : list [ Recipe ], }, ) # Use the response as a JSON string. print ( response . text ) # Use instantiated objects. my_recipes : list [ Recipe ] = response . parsed Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null. JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = \ No newline at end of file diff --git a/docstore/030b1527-8ced-49d9-bd47-85ff7e03be82 b/docstore/030b1527-8ced-49d9-bd47-85ff7e03be82 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/030b1527-8ced-49d9-bd47-85ff7e03be82 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
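Returning briefly to the structured output example above: because Pydantic validators are not applied and a suppressed pydantic.ValidationError can leave .parsed empty, a defensive pattern is to fall back to parsing response.text yourself. A minimal sketch, assuming the same Recipe model and google-genai SDK as above:

Python
import json

from google import genai
from pydantic import BaseModel


class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]


client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)

recipes = response.parsed
if not recipes:
    # .parsed may be empty/None if a validation error was suppressed;
    # fall back to the raw JSON string in response.text.
    recipes = json.loads(response.text)

print(recipes)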
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/031c04d8-f096-4d8f-b7aa-46c17510990a b/docstore/031c04d8-f096-4d8f-b7aa-46c17510990a new file mode 100644 index 0000000000000000000000000000000000000000..b0571e28c8e74f7e3e23139b08c0865b24edbd38 --- /dev/null +++ b/docstore/031c04d8-f096-4d8f-b7aa-46c17510990a @@ -0,0 +1 @@ +And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python . 
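Before the full Schema field listing below, here is a hedged sketch of passing a plain Schema-style dictionary (rather than a Pydantic model) as response_schema, including the propertyOrdering field that the property-ordering warning in this guide recommends setting. It assumes the same google-genai SDK style as the enum example above.

Python
from google import genai

client = genai.Client()

# A hand-written schema using the OpenAPI-subset fields described in this guide,
# with propertyOrdering controlling the order of keys in the generated JSON.
recipe_schema = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "recipe_name": {"type": "STRING"},
            "ingredients": {"type": "ARRAY", "items": {"type": "STRING"}},
        },
        "required": ["recipe_name", "ingredients"],
        "propertyOrdering": ["recipe_name", "ingredients"],
    },
}

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List two popular cookie recipes with their main ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": recipe_schema,
    },
)
print(response.text)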
Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": \ No newline at end of file diff --git a/docstore/034ae534-2b29-4bcf-9871-a9967f35e54c b/docstore/034ae534-2b29-4bcf-9871-a9967f35e54c new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/034ae534-2b29-4bcf-9871-a9967f35e54c @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/034fe1a9-bb88-4d03-99ec-18c5dd497026 b/docstore/034fe1a9-bb88-4d03-99ec-18c5dd497026 new file mode 100644 index 0000000000000000000000000000000000000000..c6d02be1542823e4201c0e0593de522656b4e223 --- /dev/null +++ b/docstore/034fe1a9-bb88-4d03-99ec-18c5dd497026 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#thinking Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/03624133-b3a3-4b5d-8265-6eaa5d022c8d b/docstore/03624133-b3a3-4b5d-8265-6eaa5d022c8d new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/03624133-b3a3-4b5d-8265-6eaa5d022c8d @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. 
Avoid exceeding three phrases for cleaner compositions. Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/038870f2-74fc-4c32-bf18-2c151911b278 b/docstore/038870f2-74fc-4c32-bf18-2c151911b278 new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/038870f2-74fc-4c32-bf18-2c151911b278 @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. 
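Combining the voice and language settings above into a single session configuration, here is a minimal sketch, assuming the google-genai Python SDK, the client.aio.live.connect pattern shown just below, and that both fields can be set together under speech_config (native audio models ignore an explicit language code, as noted above).

Python
import asyncio

from google import genai

client = genai.Client()

# Session configuration combining audio output, a prebuilt voice, and a language code
# (half-cascade models only; native audio models pick the language automatically).
config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}},
        "language_code": "de-DE",
    },
}


async def main():
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview", config=config
    ) as session:
        # Send input and receive audio here, as in the full example below.
        pass


asyncio.run(main())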
How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/038ee2b9-c022-4a33-b9b1-902387f03850 b/docstore/038ee2b9-c022-4a33-b9b1-902387f03850 new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/038ee2b9-c022-4a33-b9b1-902387f03850 @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/03b980b3-f308-4a16-b4f5-9ac0f080b2db b/docstore/03b980b3-f308-4a16-b4f5-9ac0f080b2db new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/03b980b3-f308-4a16-b4f5-9ac0f080b2db @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
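As a rough, hedged sketch of calling an Imagen model from the google-genai Python SDK: the method name, config class, and response attributes below are assumptions not shown in this document and may differ from the current SDK; the prompt reuses the parameterized logo example above.

Python
from google import genai
from google.genai import types

client = genai.Client()

# Assumed API surface: generate_images with a GenerateImagesConfig.
response = client.models.generate_images(
    model="imagen-4.0-generate-preview-06-06",
    prompt="A minimalist logo for a health care company on a solid color background.",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Assumed response shape: a list of generated images with raw bytes.
for i, generated in enumerate(response.generated_images):
    with open(f"imagen_output_{i}.png", "wb") as f:
        f.write(generated.image.image_bytes)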
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/03deda5c-2a1d-4d2b-99b0-7923855510fb b/docstore/03deda5c-2a1d-4d2b-99b0-7923855510fb new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/03deda5c-2a1d-4d2b-99b0-7923855510fb @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
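Given the long-context figures quoted above for Gemini 1.5 Pro, here is a minimal sketch of pushing a large document through the Files API and asking a question about it, reusing the upload pattern shown elsewhere in this document (the file path is a placeholder).

Python
from google import genai

client = genai.Client()

# Upload a large PDF once via the Files API, then reference it in the prompt.
large_doc = client.files.upload(
    file="path/to/large-report.pdf",
    config=dict(mime_type="application/pdf"),
)

response = client.models.generate_content(
    model="gemini-1.5-pro",
    contents=[large_doc, "Summarize the key findings of this document."],
)
print(response.text)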
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/03e696af-5d85-4432-873a-12bfe0f9303d b/docstore/03e696af-5d85-4432-873a-12bfe0f9303d new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/03e696af-5d85-4432-873a-12bfe0f9303d @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/04054ae9-d1f7-459b-a5c0-770ca71456a7 b/docstore/04054ae9-d1f7-459b-a5c0-770ca71456a7 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/04054ae9-d1f7-459b-a5c0-770ca71456a7 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/0407b246-7b79-4f39-9294-6249d16d33d2 b/docstore/0407b246-7b79-4f39-9294-6249d16d33d2 new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/0407b246-7b79-4f39-9294-6249d16d33d2 @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. 
Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/04080b47-3180-450d-bc80-74dfdee79ad9 b/docstore/04080b47-3180-450d-bc80-74dfdee79ad9 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/04080b47-3180-450d-bc80-74dfdee79ad9 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
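Stepping back to the embedding models described above, a brief sketch of generating text embeddings with the google-genai Python SDK; the embed_content method and the embeddings/values attributes are assumptions based on the SDK rather than code shown in this document.

Python
from google import genai

client = genai.Client()

# Embed a short piece of text with the Text Embedding model (768 dimensions).
result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)

# One embedding is returned per input; print its dimensionality.
embedding = result.embeddings[0]
print(len(embedding.values))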
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/0427d1e6-3db7-499d-b2f0-476bb4fce4d5 b/docstore/0427d1e6-3db7-499d-b2f0-476bb4fce4d5 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/0427d1e6-3db7-499d-b2f0-476bb4fce4d5 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/04282b0c-0b4e-41a1-8bb2-64654a1f5afd b/docstore/04282b0c-0b4e-41a1-8bb2-64654a1f5afd new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/04282b0c-0b4e-41a1-8bb2-64654a1f5afd @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/042f97f2-97d2-449e-a851-a4d21550c56e b/docstore/042f97f2-97d2-449e-a851-a4d21550c56e new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/042f97f2-97d2-449e-a851-a4d21550c56e @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . 
log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/044922bc-ec8b-447a-9f66-6f4ef43dec27 b/docstore/044922bc-ec8b-447a-9f66-6f4ef43dec27 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/044922bc-ec8b-447a-9f66-6f4ef43dec27 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/0479d6ca-b7f5-4412-b3cd-3fe4f92ed3d6 b/docstore/0479d6ca-b7f5-4412-b3cd-3fe4f92ed3d6 new file mode 100644 index 0000000000000000000000000000000000000000..1fd617a587d76016a0c4d5b56098be9076683928 --- /dev/null +++ b/docstore/0479d6ca-b7f5-4412-b3cd-3fe4f92ed3d6 @@ -0,0 +1 @@ +candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, \ No newline at end of file diff --git a/docstore/04b11469-ce08-4a77-b176-29ad1d419785 b/docstore/04b11469-ce08-4a77-b176-29ad1d419785 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/04b11469-ce08-4a77-b176-29ad1d419785 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/04b72b80-8b77-498b-ab59-8d75c59f3ec5 b/docstore/04b72b80-8b77-498b-ab59-8d75c59f3ec5 new file mode 100644 index 0000000000000000000000000000000000000000..69f7399c35aaaad68e1bd1a996c44353577b3a79 --- /dev/null +++ b/docstore/04b72b80-8b77-498b-ab59-8d75c59f3ec5 @@ -0,0 +1 @@ +the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await \ No newline at end of file diff --git a/docstore/04b84550-7149-47c9-a927-6e708f6713d2 b/docstore/04b84550-7149-47c9-a927-6e708f6713d2 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/04b84550-7149-47c9-a927-6e708f6713d2 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
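The JavaScript Live API snippet above stores the handle from sessionResumptionUpdate and passes it back when reconnecting. As a minimal Python sketch of that same session-resumption pattern, assuming the google-genai SDK's LiveConnectConfig and SessionResumptionConfig types and the gemini-live-2.5-flash-preview model used above (the exact message field names are illustrative, not authoritative):

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

async def run(previous_session_handle=None):
    # Pass the handle of the session to resume, or None to start a new session.
    config = types.LiveConnectConfig(
        response_modalities=["TEXT"],
        session_resumption=types.SessionResumptionConfig(handle=previous_session_handle),
    )
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello, how are you?")])
        )
        new_handle = previous_session_handle
        async for message in session.receive():
            # Keep the freshest resumption handle the server sends.
            if message.session_resumption_update and message.session_resumption_update.new_handle:
                new_handle = message.session_resumption_update.new_handle
            if message.text:
                print(message.text, end="")
            if message.server_content and message.server_content.turn_complete:
                break
    return new_handle  # store externally and pass back in later to resume this session

if __name__ == "__main__":
    asyncio.run(run())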
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/04bb7966-1739-415f-af01-3fb72f6bb0cb b/docstore/04bb7966-1739-415f-af01-3fb72f6bb0cb new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/04bb7966-1739-415f-af01-3fb72f6bb0cb @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . 
Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/04be76af-5d3c-478c-bc70-667c142cca2f b/docstore/04be76af-5d3c-478c-bc70-667c142cca2f new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/04be76af-5d3c-478c-bc70-667c142cca2f @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/04d64ebc-9533-4738-8a22-a8699a5b2946 b/docstore/04d64ebc-9533-4738-8a22-a8699a5b2946 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/04d64ebc-9533-4738-8a22-a8699a5b2946 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. 
type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/0503e445-03c9-4a96-ab29-267953ac4973 b/docstore/0503e445-03c9-4a96-ab29-267953ac4973 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/0503e445-03c9-4a96-ab29-267953ac4973 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." 
, }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/05079668-2f49-4c9f-b3c9-f95617d00ff4 b/docstore/05079668-2f49-4c9f-b3c9-f95617d00ff4 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/05079668-2f49-4c9f-b3c9-f95617d00ff4 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/05164eb4-315d-4ca2-84d4-6355dd0d69ae b/docstore/05164eb4-315d-4ca2-84d4-6355dd0d69ae new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/05164eb4-315d-4ca2-84d4-6355dd0d69ae @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/05224225-6aae-41fe-a1e1-12e39d8c9cf2 b/docstore/05224225-6aae-41fe-a1e1-12e39d8c9cf2 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/05224225-6aae-41fe-a1e1-12e39d8c9cf2 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/05291d19-02a0-494d-9116-423f4df990f0 b/docstore/05291d19-02a0-494d-9116-423f4df990f0 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/05291d19-02a0-494d-9116-423f4df990f0 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/052dcd35-a07c-44e4-8d46-e5e7fb5a979f b/docstore/052dcd35-a07c-44e4-8d46-e5e7fb5a979f new file mode 100644 index 0000000000000000000000000000000000000000..93f56ebef9af2c2337fadabd0969094c5f9f9a6e --- /dev/null +++ b/docstore/052dcd35-a07c-44e4-8d46-e5e7fb5a979f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media#troubleshooting Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/0531a0cf-ecfb-4737-be01-ea672125dd3b b/docstore/0531a0cf-ecfb-4737-be01-ea672125dd3b new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/0531a0cf-ecfb-4737-be01-ea672125dd3b @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/053d5545-ecdf-43f9-96d7-2cf0e72d51f0 b/docstore/053d5545-ecdf-43f9-96d7-2cf0e72d51f0 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/053d5545-ecdf-43f9-96d7-2cf0e72d51f0 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/0564263f-60a7-450c-8352-eb0ed8e9cd45 b/docstore/0564263f-60a7-450c-8352-eb0ed8e9cd45 new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/0564263f-60a7-450c-8352-eb0ed8e9cd45 @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. 
- large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/05b0cc5a-3afc-42e5-a764-838701d2d215 b/docstore/05b0cc5a-3afc-42e5-a764-838701d2d215 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/05b0cc5a-3afc-42e5-a764-838701d2d215 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/05c98955-bb17-4bec-a79f-857452e235fb b/docstore/05c98955-bb17-4bec-a79f-857452e235fb new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/05c98955-bb17-4bec-a79f-857452e235fb @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . 
sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/05d04b84-4355-4f38-9eb2-f19723cc1afd b/docstore/05d04b84-4355-4f38-9eb2-f19723cc1afd new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/05d04b84-4355-4f38-9eb2-f19723cc1afd @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/0603ecbb-737d-4556-b1a9-fa792d85866e b/docstore/0603ecbb-737d-4556-b1a9-fa792d85866e new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/0603ecbb-737d-4556-b1a9-fa792d85866e @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . 
name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/060452e9-d152-4249-a469-90135c953ed6 b/docstore/060452e9-d152-4249-a469-90135c953ed6 new file mode 100644 index 0000000000000000000000000000000000000000..96b994998643d2df29eb74708bc512bc02b7e41e --- /dev/null +++ b/docstore/060452e9-d152-4249-a469-90135c953ed6 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/code-execution#supported-libraries Title: Code execution | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/060de578-1772-4186-8d59-455f49bc36ac b/docstore/060de578-1772-4186-8d59-455f49bc36ac new file mode 100644 index 0000000000000000000000000000000000000000..98b394b58a10b21e5db5c0db2762b1faad87315e --- /dev/null +++ b/docstore/060de578-1772-4186-8d59-455f49bc36ac @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-native-audio Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/062a3862-afae-4656-b135-605d502dfa4f b/docstore/062a3862-afae-4656-b135-605d502dfa4f new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/062a3862-afae-4656-b135-605d502dfa4f @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . 
Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/06340277-c0b0-41d2-b7fc-d6e290d00255 b/docstore/06340277-c0b0-41d2-b7fc-d6e290d00255 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/06340277-c0b0-41d2-b7fc-d6e290d00255 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . 
generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/06507db0-220d-4ccc-bb2c-09306d85cdb7 b/docstore/06507db0-220d-4ccc-bb2c-09306d85cdb7 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/06507db0-220d-4ccc-bb2c-09306d85cdb7 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/0677b0c8-92ff-40cb-ab10-4acd7f088734 b/docstore/0677b0c8-92ff-40cb-ab10-4acd7f088734 new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/0677b0c8-92ff-40cb-ab10-4acd7f088734 @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
\ No newline at end of file diff --git a/docstore/06792137-4071-4b1e-95de-2a359a43f197 b/docstore/06792137-4071-4b1e-95de-2a359a43f197 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/06792137-4071-4b1e-95de-2a359a43f197 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/067c9501-3e3e-4a25-89ad-ab9ab8109705 b/docstore/067c9501-3e3e-4a25-89ad-ab9ab8109705 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/067c9501-3e3e-4a25-89ad-ab9ab8109705 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/06822ade-be83-4ce0-81a0-edd9fb6b5343 b/docstore/06822ade-be83-4ce0-81a0-edd9fb6b5343 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/06822ade-be83-4ce0-81a0-edd9fb6b5343 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/069976a4-f72f-4498-97f6-82380fca8d37 b/docstore/069976a4-f72f-4498-97f6-82380fca8d37 new file mode 100644 index 0000000000000000000000000000000000000000..68dfcf53eb693dba8358b7fdf6b0010fadcbc966 --- /dev/null +++ b/docstore/069976a4-f72f-4498-97f6-82380fca8d37 @@ -0,0 +1 @@ +. files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), ]), }); console . log ( countTokensResponse . totalTokens ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . 
UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } tokens , _ := client . Models . CountTokens ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Printf ( "File %s is %d tokens\n" , localAudioPath , tokens . TotalTokens ) } Supported audio formats Gemini supports the following audio format MIME types: WAV - audio/wav MP3 - audio/mp3 AIFF - audio/aiff AAC - audio/aac OGG Vorbis - audio/ogg FLAC - audio/flac Technical details about audio Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens. Gemini can "understand" non-speech components, such as birdsong or sirens. The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the number of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours. Gemini downsamples audio files to a 16 Kbps data resolution. If the audio source contains multiple channels, Gemini combines those channels into a single channel. What's next This guide shows how to generate text in response to audio data. To learn more, see the following resources: File prompting strategies : \ No newline at end of file diff --git a/docstore/06af393a-1dc2-4a92-ac48-6c83e23bb1d8 b/docstore/06af393a-1dc2-4a92-ac48-6c83e23bb1d8 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/06af393a-1dc2-4a92-ac48-6c83e23bb1d8 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
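As a rough Python counterpart to the Go token-counting example shown earlier (a sketch only, assuming the same sample file path), you can upload an audio file and count its tokens, keeping in mind the roughly 32 tokens per second of audio noted above:

Python
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")
response = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=[myfile],
)
# About 32 tokens per second of audio, so one minute is roughly 1,920 tokens.
print(response.total_tokens)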
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/0709683a-eea4-470b-a555-eaa9e8cb93c4 b/docstore/0709683a-eea4-470b-a555-eaa9e8cb93c4 new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/0709683a-eea4-470b-a555-eaa9e8cb93c4 @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/071e2bf3-bd4c-4870-b369-b63464b6a91a b/docstore/071e2bf3-bd4c-4870-b369-b63464b6a91a new file mode 100644 index 0000000000000000000000000000000000000000..1983a1b7b4b0634f95c028654d1fae0a75b50e6a --- /dev/null +++ b/docstore/071e2bf3-bd4c-4870-b369-b63464b6a91a @@ -0,0 +1 @@ +{ mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . 
entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The \ No newline at end of file diff --git a/docstore/0729bde6-fc7b-4709-b0f0-2473c41b40df b/docstore/0729bde6-fc7b-4709-b0f0-2473c41b40df new file mode 100644 index 0000000000000000000000000000000000000000..2cf481a1418dcbba70e5c6548bd9873482649a44 --- /dev/null +++ b/docstore/0729bde6-fc7b-4709-b0f0-2473c41b40df @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/sdks#main-content Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/0771c676-c8b7-40be-b333-05643b9b821c b/docstore/0771c676-c8b7-40be-b333-05643b9b821c new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/0771c676-c8b7-40be-b333-05643b9b821c @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . 
GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/0776594d-77fe-4f4f-b650-c316c761bcef b/docstore/0776594d-77fe-4f4f-b650-c316c761bcef new file mode 100644 index 0000000000000000000000000000000000000000..48ce7760ed3b3e078bbb96293e0e67132c5a10c7 --- /dev/null +++ b/docstore/0776594d-77fe-4f4f-b650-c316c761bcef @@ -0,0 +1 @@ +Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). "Jupiter's Great Red Spot Shrinks and Grows" is in the \ No newline at end of file diff --git a/docstore/078d197e-b13a-47a7-b991-5f6ae8f5b1b7 b/docstore/078d197e-b13a-47a7-b991-5f6ae8f5b1b7 new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/078d197e-b13a-47a7-b991-5f6ae8f5b1b7 @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . 
debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . \ No newline at end of file diff --git a/docstore/07bc0279-bb3f-46d1-885b-40b91bf1cfc5 b/docstore/07bc0279-bb3f-46d1-885b-40b91bf1cfc5 new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/07bc0279-bb3f-46d1-885b-40b91bf1cfc5 @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/0805f694-a2f8-4f35-83fe-d9e809c0625e b/docstore/0805f694-a2f8-4f35-83fe-d9e809c0625e new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/0805f694-a2f8-4f35-83fe-d9e809c0625e @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/081f67d8-dd19-4945-b103-0f006ac8d631 b/docstore/081f67d8-dd19-4945-b103-0f006ac8d631 new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/081f67d8-dd19-4945-b103-0f006ac8d631 @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . \ No newline at end of file diff --git a/docstore/0858294b-bfc9-4b56-b27d-63d69f4623e6 b/docstore/0858294b-bfc9-4b56-b27d-63d69f4623e6 new file mode 100644 index 0000000000000000000000000000000000000000..7ad07eb45fff1ffd88928a8c1191c40c43412859 --- /dev/null +++ b/docstore/0858294b-bfc9-4b56-b27d-63d69f4623e6 @@ -0,0 +1 @@ +public domain and does not show identifiable people. ( NASA image and media usage guidelines. ) The following code downloads the sample video, uploads it using the File API, waits for it to be processed, and then uses the file reference in a generateContent request. Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp4" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . 
upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file \ No newline at end of file diff --git a/docstore/08612474-98dc-4e4a-aa8b-acf5af374761 b/docstore/08612474-98dc-4e4a-aa8b-acf5af374761 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/08612474-98dc-4e4a-aa8b-acf5af374761 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. 
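As a brief sketch of the tools object just described (reusing, for illustration, the disco-scenario declarations that appear later in this guide), one or more function declarations can be grouped into a single tool and passed through the request config:

Python
from google.genai import types

# Group one or more function declarations into a single tools object.
house_tools = types.Tool(
    function_declarations=[power_disco_ball, start_music, dim_lights]
)
config = types.GenerateContentConfig(
    tools=[house_tools],
)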
A single function \ No newline at end of file diff --git a/docstore/0869bc16-c9d9-444b-b58d-c0a6c9796759 b/docstore/0869bc16-c9d9-444b-b58d-c0a6c9796759 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/0869bc16-c9d9-444b-b58d-c0a6c9796759 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/08777103-03dc-4e2c-9171-2e960166ac11 b/docstore/08777103-03dc-4e2c-9171-2e960166ac11 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/08777103-03dc-4e2c-9171-2e960166ac11 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. 
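The post-processing this implies can be sketched in Python as follows (field names such as "mask" and any data-URI prefix are assumptions for illustration; the resizing and thresholding steps are described next):

Python
import base64
import io

import numpy as np
from PIL import Image

def decode_segmentation_mask(item, image_width, image_height, threshold=127):
    # Assumed fields: "box_2d" is [y0, x0, y1, x1] normalized to 0-1000,
    # "mask" is a base64-encoded PNG probability map with values 0-255.
    y0, x0, y1, x1 = item["box_2d"]
    x0, x1 = x0 * image_width // 1000, x1 * image_width // 1000
    y0, y1 = y0 * image_height // 1000, y1 * image_height // 1000
    png_data = item["mask"].split(",")[-1]  # drop a "data:image/png;base64," prefix if present
    mask_img = Image.open(io.BytesIO(base64.b64decode(png_data)))
    # Resize the probability map to the bounding box size, then binarize at the threshold.
    mask_img = mask_img.resize((x1 - x0, y1 - y0), Image.Resampling.BILINEAR)
    binary_mask = np.array(mask_img) > threshold
    return binary_mask, (x0, y0, x1, y1)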
The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/088294c2-398a-41ca-b684-270e02195179 b/docstore/088294c2-398a-41ca-b684-270e02195179 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/088294c2-398a-41ca-b684-270e02195179 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/08863662-b342-46c7-a302-64825014e0fd b/docstore/08863662-b342-46c7-a302-64825014e0fd new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/08863662-b342-46c7-a302-64825014e0fd @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/0895b2dd-bf47-4cf4-9ba8-7f76ddc32ddd b/docstore/0895b2dd-bf47-4cf4-9ba8-7f76ddc32ddd new file mode 100644 index 0000000000000000000000000000000000000000..2ca4f6201f11f85c04ee3e953ef8c501864496b7 --- /dev/null +++ b/docstore/0895b2dd-bf47-4cf4-9ba8-7f76ddc32ddd @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/grounding#main-content Title: Grounding with Google Search | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/08d426ac-6d33-4568-8d33-e9e21518b006 b/docstore/08d426ac-6d33-4568-8d33-e9e21518b006 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/08d426ac-6d33-4568-8d33-e9e21518b006 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/08e4695c-bf59-4684-8ea0-853b48b4c246 b/docstore/08e4695c-bf59-4684-8ea0-853b48b4c246 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/08e4695c-bf59-4684-8ea0-853b48b4c246 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/08f245cd-da17-40d6-98d8-70bb942ac256 b/docstore/08f245cd-da17-40d6-98d8-70bb942ac256 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/08f245cd-da17-40d6-98d8-70bb942ac256 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/09191eee-fbcf-4b5b-bc1e-1038e385ded8 b/docstore/09191eee-fbcf-4b5b-bc1e-1038e385ded8 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/09191eee-fbcf-4b5b-bc1e-1038e385ded8 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/09194a4b-18b1-4009-a50a-e52df3ce48d5 b/docstore/09194a4b-18b1-4009-a50a-e52df3ce48d5 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/09194a4b-18b1-4009-a50a-e52df3ce48d5 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/093231b5-00fe-4f50-b6b9-027f219a0475 b/docstore/093231b5-00fe-4f50-b6b9-027f219a0475 new file mode 100644 index 0000000000000000000000000000000000000000..64b38d40afbaa776eeced04508049a0f469e337d --- /dev/null +++ b/docstore/093231b5-00fe-4f50-b6b9-027f219a0475 @@ -0,0 +1 @@ +OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" 
}]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" } ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . b64encode ( \ No newline at end of file diff --git a/docstore/0957ba90-7e55-44fd-a15c-797eec41f3f5 b/docstore/0957ba90-7e55-44fd-a15c-797eec41f3f5 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/0957ba90-7e55-44fd-a15c-797eec41f3f5 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . 
log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/09885987-84bd-4374-b19e-d8e5fb5edf52 b/docstore/09885987-84bd-4374-b19e-d8e5fb5edf52 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/09885987-84bd-4374-b19e-d8e5fb5edf52 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/09b2295a-7bad-48dd-b699-0097606ae12a b/docstore/09b2295a-7bad-48dd-b699-0097606ae12a new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/09b2295a-7bad-48dd-b699-0097606ae12a @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/09b62727-d96c-47ba-a1ef-0467b32b0435 b/docstore/09b62727-d96c-47ba-a1ef-0467b32b0435 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/09b62727-d96c-47ba-a1ef-0467b32b0435 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
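The Live API model card that follows lists models/gemini-2.0-flash-live-001. For orientation, here is a minimal text-only sketch of opening a Live API session; the client.aio.live.connect, send_client_content, and receive calls are assumptions based on the google-genai Python SDK and may differ between SDK versions.

Python
import asyncio
from google import genai

client = genai.Client()

async def main():
    # Text-only responses keep the sketch simple; audio output is also supported.
    config = {"response_modalities": ["TEXT"]}
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello, can you hear me?"}]},
            turn_complete=True,
        )
        # Stream the model's reply as it arrives.
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())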
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/09b836a3-695f-4607-a422-82f271bcede6 b/docstore/09b836a3-695f-4607-a422-82f271bcede6 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/09b836a3-695f-4607-a422-82f271bcede6 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/09cb457a-8285-429c-98c7-28927497b8ab b/docstore/09cb457a-8285-429c-98c7-28927497b8ab new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/09cb457a-8285-429c-98c7-28927497b8ab @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/09d02012-c57c-4c2e-b5f2-b34b33ac8d13 b/docstore/09d02012-c57c-4c2e-b5f2-b34b33ac8d13 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/09d02012-c57c-4c2e-b5f2-b34b33ac8d13 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/0a00f553-f73e-480d-9e58-6f82c5eb7556 b/docstore/0a00f553-f73e-480d-9e58-6f82c5eb7556 new file mode 100644 index 0000000000000000000000000000000000000000..5b23b75839f7d9f5e86c0814ceb13216aba4c820 --- /dev/null +++ b/docstore/0a00f553-f73e-480d-9e58-6f82c5eb7556 @@ -0,0 +1 @@ +Using Gemini API keys | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Using Gemini API keys To use the Gemini API, you need an API key. You can create a key for free with a few clicks in Google AI Studio . Once you have an API key, you have the following options to connect to the Gemini API: Setting your API key as an environment variable Providing your API key explicitly For initial testing, you can hard code an API key, but this should only be temporary since it's not secure. 
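As a minimal sketch of the two connection options just described, assuming the google-genai Python SDK: genai.Client() picks up the GEMINI_API_KEY environment variable automatically, and an explicit api_key argument is available for quick local testing only.

Python
from google import genai

# Option 1 (recommended): rely on the GEMINI_API_KEY environment variable.
client = genai.Client()

# Option 2 (temporary, for local testing only; never commit a hard-coded key):
# client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Say hello in one short sentence.",
)
print(response.text)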
You can find examples for hard coding the API key in Providing API key explicitly section. Setting API key as environment variable If you set the environment variable GEMINI_API_KEY or GOOGLE_API_KEY , the API key will automatically be picked up by the client when using one of the Gemini API libraries . It's recommended that you set only one of those variables, but if both are set, GOOGLE_API_KEY takes precedence. If you're using the REST API, or JavaScript on the browser, you will need to provide the API key explicitly. Here is how you can set your API key locally as the environment variable GEMINI_API_KEY with different operating systems. Linux/macOS - Bash Bash is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.bashrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use zsh : touch ~/.bashrc open ~/.bashrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.bashrc macOS \ No newline at end of file diff --git a/docstore/0a0dbe88-1069-41c3-8b4b-0753a2fe4103 b/docstore/0a0dbe88-1069-41c3-8b4b-0753a2fe4103 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/0a0dbe88-1069-41c3-8b4b-0753a2fe4103 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
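Tying the model version name patterns above back to code, a small sketch of how the same call can target either the latest stable alias or a pinned stable release (model IDs taken from the examples above):

Python
from google import genai

client = genai.Client()

# "Latest stable" alias: tracks the newest stable release of this variation.
resp_latest = client.models.generate_content(
    model="gemini-2.0-flash", contents="Hello"
)

# Pinned stable version: usually doesn't change; suited to production apps.
resp_pinned = client.models.generate_content(
    model="gemini-2.0-flash-001", contents="Hello"
)

print(resp_latest.text)
print(resp_pinned.text)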
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/0a16ce51-e85a-4f78-8b61-f87d67bbcade b/docstore/0a16ce51-e85a-4f78-8b61-f87d67bbcade new file mode 100644 index 0000000000000000000000000000000000000000..ca9fcd920a86bdee4d0b622e9ecd16eba0587472 --- /dev/null +++ b/docstore/0a16ce51-e85a-4f78-8b61-f87d67bbcade @@ -0,0 +1 @@ +[{"code_execution": {}}], "contents": [ { "role": "user", "parts": [{ "text": "Can you print \"Hello world!\"?" }] },{ "role": "model", "parts": [ { "text": "" }, { "executable_code": { "language": "PYTHON", "code": "\nprint(\"hello world!\")\n" } }, { "code_execution_result": { "outcome": "OUTCOME_OK", "output": "hello world!\n" } }, { "text": "I have printed \"hello world!\" using the provided python code block. \n" } ], },{ "role": "user", "parts": [{ "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." }] } ] }' Input/output (I/O) Starting with Gemini 2.0 Flash , code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have Matplotlib graphs generated as part of the response. The output files are returned as inline images in the response. I/O pricing When using code execution I/O, you're charged for input tokens and output tokens: Input tokens: User prompt Output tokens: Code generated by the model Code execution output in the code environment Thinking tokens Summary generated by the model I/O details When you're working with code execution I/O, be aware of the following technical details: The maximum runtime of the code environment is 30 seconds. If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times. The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it. Code execution works best with text and CSV files. The input file can be passed in part.inlineData or part.fileData (uploaded via the Files API ), and the output file is always returned as part.inlineData . Single turn Bidirectional \ No newline at end of file diff --git a/docstore/0a33c064-6d78-4124-995e-5c0e8437c830 b/docstore/0a33c064-6d78-4124-995e-5c0e8437c830 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/0a33c064-6d78-4124-995e-5c0e8437c830 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
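Returning to the code execution section above, which showed only the REST form: the following is a minimal Python sketch of enabling the code execution tool. The types.Tool(code_execution=...) and ToolCodeExecution names are assumptions based on the google-genai SDK and may differ by version.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the sum of the first 50 prime numbers? "
             "Generate and run code for the calculation.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())],
    ),
)

# The response interleaves text, generated code, and execution results.
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    if part.executable_code:
        print(part.executable_code.code)
    if part.code_execution_result:
        print(part.code_execution_result.output)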
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/0a3c12c2-e57f-41c7-8109-bc19128eff9a b/docstore/0a3c12c2-e57f-41c7-8109-bc19128eff9a new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/0a3c12c2-e57f-41c7-8109-bc19128eff9a @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/0a5df3e0-3a45-49ee-b7c2-fbf533f0ee80 b/docstore/0a5df3e0-3a45-49ee-b7c2-fbf533f0ee80 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/0a5df3e0-3a45-49ee-b7c2-fbf533f0ee80 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/0a7d4c12-745b-4794-9219-ca9a6ec47e57 b/docstore/0a7d4c12-745b-4794-9219-ca9a6ec47e57 new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/0a7d4c12-745b-4794-9219-ca9a6ec47e57 @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. 
A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example \ No newline at end of file diff --git a/docstore/0a911ecf-a88a-4578-ae79-8c835e68adb1 b/docstore/0a911ecf-a88a-4578-ae79-8c835e68adb1 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/0a911ecf-a88a-4578-ae79-8c835e68adb1 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. 
Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/0a95df86-cddf-4be5-b0af-b0e920263538 b/docstore/0a95df86-cddf-4be5-b0af-b0e920263538 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/0a95df86-cddf-4be5-b0af-b0e920263538 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/0aa3b7b0-59fd-4726-868a-359db9ff8e8b b/docstore/0aa3b7b0-59fd-4726-868a-359db9ff8e8b new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/0aa3b7b0-59fd-4726-868a-359db9ff8e8b @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/0b7aac16-a063-4c10-a7b4-4a8be20746b5 b/docstore/0b7aac16-a063-4c10-a7b4-4a8be20746b5 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/0b7aac16-a063-4c10-a7b4-4a8be20746b5 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
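Both TTS preview models described here are called through the standard generateContent surface with an AUDIO response modality. A minimal Python sketch follows; the speech_config, PrebuiltVoiceConfig, and "Kore" voice names, as well as the 24 kHz 16-bit mono output format, are assumptions based on the google-genai SDK and may differ by version.

Python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# Raw PCM audio bytes come back as inline data on the first part.
pcm = response.candidates[0].content.parts[0].inline_data.data

# Write a playable WAV file (assumed format: 24 kHz, 16-bit, mono PCM).
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)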
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/0b860521-2fea-4af6-86f4-34d118c10096 b/docstore/0b860521-2fea-4af6-86f4-34d118c10096 new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/0b860521-2fea-4af6-86f4-34d118c10096 @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/0b88782e-a863-4d58-983c-618773acc759 b/docstore/0b88782e-a863-4d58-983c-618773acc759 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/0b88782e-a863-4d58-983c-618773acc759 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
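Picking up the truncated Python example from the thinking budget discussion above, a minimal sketch of setting thinkingBudget through the request config; the ThinkingConfig name is an assumption based on the google-genai types module.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain Occam's Razor and give a simple, everyday example.",
    config=types.GenerateContentConfig(
        # On 2.5 Flash: 0 disables thinking, -1 turns on dynamic thinking,
        # and a positive value caps the thinking token budget.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)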
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/0b8c1ed7-7dfd-4cb3-b0c5-ee0cb0ee8afe b/docstore/0b8c1ed7-7dfd-4cb3-b0c5-ee0cb0ee8afe new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/0b8c1ed7-7dfd-4cb3-b0c5-ee0cb0ee8afe @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? 
Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/0b9a46ee-ef88-4343-9de0-d1caf75fec72 b/docstore/0b9a46ee-ef88-4343-9de0-d1caf75fec72 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/0b9a46ee-ef88-4343-9de0-d1caf75fec72 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/0bbafd9d-07cf-4a27-b8fd-011e7a6fb1fb b/docstore/0bbafd9d-07cf-4a27-b8fd-011e7a6fb1fb new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/0bbafd9d-07cf-4a27-b8fd-011e7a6fb1fb @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. 
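To illustrate how such embeddings are requested and compared, here is a minimal Python sketch; the embed_content call is assumed from the google-genai SDK, the model ID text-embedding-004 comes from this section, and the cosine-similarity helper is purely illustrative.

Python
from google import genai

client = genai.Client()

texts = ["What is the meaning of life?", "How do I bake a pie?"]

# embed_content (assumed SDK call) returns one embedding vector per input string.
result = client.models.embed_content(model="text-embedding-004", contents=texts)
vec_a, vec_b = (e.values for e in result.embeddings)

# Cosine similarity as a simple relatedness measure between the two strings.
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = (sum(a * a for a in vec_a) ** 0.5) * (sum(b * b for b in vec_b) ** 0.5)
print(f"similarity: {dot / norm:.3f}")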
text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/0bcbd714-9ef9-4609-8a9d-ed0165082a03 b/docstore/0bcbd714-9ef9-4609-8a9d-ed0165082a03 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/0bcbd714-9ef9-4609-8a9d-ed0165082a03 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/0bcdf8ad-4132-4fe5-8598-dadd2a943ef7 b/docstore/0bcdf8ad-4132-4fe5-8598-dadd2a943ef7 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/0bcdf8ad-4132-4fe5-8598-dadd2a943ef7 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
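For comparison with the multi-image REST walkthrough above, a Python sketch of the same flow: the first image is uploaded once through the Files API and the second is passed inline, using client.files.upload and types.Part.from_bytes as shown elsewhere in this guide. The file paths are placeholders.

Python
from google import genai
from google.genai import types

client = genai.Client()

# First image: upload via the Files API (reusable across requests).
uploaded = client.files.upload(file="path/to/image1.jpg")

# Second image: pass the bytes inline with the request.
with open("path/to/image2.png", "rb") as f:
    inline_part = types.Part.from_bytes(data=f.read(), mime_type="image/png")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=["What is different between these two images?", uploaded, inline_part],
)
print(response.text)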
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/0bf274fe-edf6-4873-8f7d-1f30ca97e762 b/docstore/0bf274fe-edf6-4873-8f7d-1f30ca97e762 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/0bf274fe-edf6-4873-8f7d-1f30ca97e762 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. 
Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/0c01088f-3f70-410f-bc51-829acb540a50 b/docstore/0c01088f-3f70-410f-bc51-829acb540a50 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/0c01088f-3f70-410f-bc51-829acb540a50 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/0c0a78de-2ac7-4cdf-8cb4-69fdb8848a17 b/docstore/0c0a78de-2ac7-4cdf-8cb4-69fdb8848a17 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/0c0a78de-2ac7-4cdf-8cb4-69fdb8848a17 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/0c0fc134-540f-49a7-83a3-2e21aa391c3d b/docstore/0c0fc134-540f-49a7-83a3-2e21aa391c3d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/0c0fc134-540f-49a7-83a3-2e21aa391c3d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/0c122203-e59e-4fed-acb9-6c4d75728e99 b/docstore/0c122203-e59e-4fed-acb9-6c4d75728e99 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/0c122203-e59e-4fed-acb9-6c4d75728e99 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
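The following is an illustrative sketch only, not part of the model card: connecting to this model through the Live API with the Python google-genai SDK, using the gemini-live-2.5-flash-preview model code listed below and text-only responses. The connection pattern mirrors the session-resumption example elsewhere in these docs.

Python

# Minimal Live API sketch (assumes google-genai is installed and GEMINI_API_KEY is set).
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

async def main():
    config = types.LiveConnectConfig(response_modalities=["TEXT"])
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello!")])
        )
        async for message in session.receive():
            if message.text is not None:
                print(message.text)

asyncio.run(main())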
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/0c14e060-ff67-4273-a221-c1469b636e33 b/docstore/0c14e060-ff67-4273-a221-c1469b636e33 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/0c14e060-ff67-4273-a221-c1469b636e33 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/0c174fa8-13dc-4a1c-9e38-d17983f9b0a1 b/docstore/0c174fa8-13dc-4a1c-9e38-d17983f9b0a1 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/0c174fa8-13dc-4a1c-9e38-d17983f9b0a1 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" 
# Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/0c297b6e-bb5a-4849-bf3a-c80c0388a230 b/docstore/0c297b6e-bb5a-4849-bf3a-c80c0388a230 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/0c297b6e-bb5a-4849-bf3a-c80c0388a230 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." 
, }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/0c432c30-5c2c-4635-a929-367cc930508e b/docstore/0c432c30-5c2c-4635-a929-367cc930508e new file mode 100644 index 0000000000000000000000000000000000000000..2446800bd40f0c531de753980e82d2c182023a25 --- /dev/null +++ b/docstore/0c432c30-5c2c-4635-a929-367cc930508e @@ -0,0 +1 @@ +how you'll trade off if a change leads to improvements for one metric to the detriment of another. Like with other performance engineering, you may want to focus on worst-case performance across your evaluation set rather than average performance. Adversarial testing involves proactively trying to break your application. The goal is to identify points of weakness so that you can take steps to remedy them as appropriate. Adversarial testing can take significant time/effort from evaluators with expertise in your application — but the more you do, the greater your chance of spotting problems, especially those occurring rarely or only after repeated runs of the application. Adversarial testing is a method for systematically evaluating an ML model with the intent of learning how it behaves when provided with malicious or inadvertently harmful input: An input may be malicious when the input is clearly designed to produce an unsafe or harmful output-- for example, asking a text generation model to generate a hateful rant about a particular religion. An input is inadvertently harmful when the input itself may be innocuous, but produces harmful output -- for example, asking a text generation model to describe a person of a particular ethnicity and receiving a racist output. What distinguishes an adversarial test from a standard evaluation is the composition of the data used for testing. For adversarial tests, select test data that is most likely to elicit problematic output from the model. This means probing the model's behavior for all the types of harms that are possible, including rare or unusual examples and edge-cases that are relevant to safety policies. It should also include diversity in the different dimensions of a sentence such as structure, meaning and length. You can refer to the Google's Responsible AI practices in fairness for more details on what to consider when building a test dataset. Advanced tips Use automated testing instead of the traditional method of \ No newline at end of file diff --git a/docstore/0c54425f-ce4d-41c4-a18d-7b3fb6925df8 b/docstore/0c54425f-ce4d-41c4-a18d-7b3fb6925df8 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/0c54425f-ce4d-41c4-a18d-7b3fb6925df8 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. 
A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/0c738c8b-7d29-46c7-bf54-a4a306abf12c b/docstore/0c738c8b-7d29-46c7-bf54-a4a306abf12c new file mode 100644 index 0000000000000000000000000000000000000000..40564cc3a339b41e3f9c5a2f24a7d0082d31abf9 --- /dev/null +++ b/docstore/0c738c8b-7d29-46c7-bf54-a4a306abf12c @@ -0,0 +1 @@ +response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . 
session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # \ No newline at end of file diff --git a/docstore/0c75f20a-b6aa-4287-bde5-010c67b6adf3 b/docstore/0c75f20a-b6aa-4287-bde5-010c67b6adf3 new file mode 100644 index 0000000000000000000000000000000000000000..65337d81cbf9fba76eb8d44ddc68611350b61de7 --- /dev/null +++ b/docstore/0c75f20a-b6aa-4287-bde5-010c67b6adf3 @@ -0,0 +1 @@ += lambda s : s . segment . end_index , reverse = True ) for support in sorted_supports : end_index = support . segment . end_index if support . grounding_chunk_indices : # Create citation string like [1](link1)[2](link2) citation_links = [] for i in support . grounding_chunk_indices : if i < len ( chunks ): uri = chunks [ i ] . web . uri citation_links . append ( f "[ { i + 1 } ]( { uri } )" ) citation_string = ", " . join ( citation_links ) text = text [: end_index ] + citation_string + text [ end_index :] return text # Assuming response with grounding metadata text_with_citations = add_citations ( response ) print ( text_with_citations ) JavaScript function addCitations ( response ) { let text = response . text ; const supports = response . candidates [ 0 ] ? . groundingMetadata ? . groundingSupports ; const chunks = response . candidates [ 0 ] ? . groundingMetadata ? . groundingChunks ; // Sort supports by end_index in descending order to avoid shifting issues when inserting. const sortedSupports = [... supports ]. sort ( ( a , b ) = > ( b . segment ? . endIndex ?? 0 ) - ( a . segment ? . endIndex ?? 0 ), ); for ( const support of sortedSupports ) { const endIndex = support . segment ? . endIndex ; if ( endIndex === undefined || ! support . groundingChunkIndices ? . length ) { continue ; } const citationLinks = support . groundingChunkIndices . map ( i = > { const uri = chunks [ i ] ? . web ? . uri ; if ( uri ) { return `[ ${ i + 1 } ]( ${ uri } )` ; } return null ; }) . filter ( Boolean ); if ( citationLinks . length > 0 ) { const citationString = citationLinks . join ( ", " ); text = text . slice ( 0 , endIndex ) + citationString + text . slice ( endIndex ); } } return text ; } const textWithCitations = addCitations ( response ); console . log ( textWithCitations ); The new response with inline citations will look like this: Spain won Euro 2024, defeating England 2-1 in the final.[1](https:/...), [2](https:/...), [4](https:/...), [5](https:/...) This victory \ No newline at end of file diff --git a/docstore/0c770e07-37d7-4c3a-b1dc-9d7e7e20d6ee b/docstore/0c770e07-37d7-4c3a-b1dc-9d7e7e20d6ee new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/0c770e07-37d7-4c3a-b1dc-9d7e7e20d6ee @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
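As a hedged illustration of the embedding workflow described above, the sketch below generates embeddings with text-embedding-004 through the Python google-genai SDK and compares two strings; the cosine-similarity helper is written for this sketch and is not part of the API.

Python

# Illustrative sketch: embed two strings and score their relatedness
# (assumes google-genai is installed and GEMINI_API_KEY is set).
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How do I prepare a weekly meal plan?"],
)

# Each entry in result.embeddings holds a 768-dimensional vector for the matching input.
vec_a, vec_b = (e.values for e in result.embeddings)

# Cosine similarity as a simple relatedness score.
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = (sum(a * a for a in vec_a) ** 0.5) * (sum(b * b for b in vec_b) ** 0.5)
print(dot / norm)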
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/0c7fde01-3fbd-473e-915a-132dcf37ce55 b/docstore/0c7fde01-3fbd-473e-915a-132dcf37ce55 new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/0c7fde01-3fbd-473e-915a-132dcf37ce55 @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. 
Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/0d107ff9-62d1-4d9f-997e-9ef736a4d809 b/docstore/0d107ff9-62d1-4d9f-997e-9ef736a4d809 new file mode 100644 index 0000000000000000000000000000000000000000..d73a03ac64bf52901f07bf0a8fe4fc21e47f6048 --- /dev/null +++ b/docstore/0d107ff9-62d1-4d9f-997e-9ef736a4d809 @@ -0,0 +1 @@ +are used in a variety of common AI use cases, such as: Information retrieval: You can use embeddings to retrieve semantically similar text given a piece of input text. Document search tutorial task Clustering: Comparing groups of embeddings can help identify hidden trends. Embedding clustering tutorial bubble_chart Vector database: As you take different embedding use cases to production, it is common to store embeddings in a vector database. Vector database tutorial bolt Classification: You can train a model using embeddings to classify documents into categories. Classification tutorial token Embedding models The Gemini API offers three models that generate text embeddings: gemini-embedding-exp-03-07 text-embedding-004 embedding-001 What's next Check out the embeddings quickstart notebook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/0d159d13-aa6f-49b9-a0f2-294937827c2e b/docstore/0d159d13-aa6f-49b9-a0f2-294937827c2e new file mode 100644 index 0000000000000000000000000000000000000000..64b38d40afbaa776eeced04508049a0f469e337d --- /dev/null +++ b/docstore/0d159d13-aa6f-49b9-a0f2-294937827c2e @@ -0,0 +1 @@ +OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" 
} ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . b64encode ( \ No newline at end of file diff --git a/docstore/0d5512ee-fea5-4bd4-8baa-e24e2f8e815b b/docstore/0d5512ee-fea5-4bd4-8baa-e24e2f8e815b new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/0d5512ee-fea5-4bd4-8baa-e24e2f8e815b @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). 
Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/0d6909bc-dfd0-4874-8c3e-cbba8a857840 b/docstore/0d6909bc-dfd0-4874-8c3e-cbba8a857840 new file mode 100644 index 0000000000000000000000000000000000000000..ebc8fdc5ad27fd96758924c177eadfccc4d6556f --- /dev/null +++ b/docstore/0d6909bc-dfd0-4874-8c3e-cbba8a857840 @@ -0,0 +1 @@ +Structured output | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Structured output You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes, standardize them to build a structured database. Gemini can generate either JSON or enum values as structured output. Generating JSON There are two ways to generate JSON using the Gemini API: Configure a schema on the model Provide a schema in a text prompt Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON. Configuring a schema (recommended) To constrain the model to generate JSON, configure a responseSchema . The model will then respond to any prompt with JSON-formatted output. Python from google import genai from pydantic import BaseModel class Recipe ( BaseModel ): recipe_name : str ingredients : list [ str ] client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "List a few popular cookie recipes, and include the amounts of ingredients." , config = { "response_mime_type" : "application/json" , "response_schema" : list [ Recipe ], }, ) # Use the response as a JSON string. print ( response . text ) # Use instantiated objects. my_recipes : list [ Recipe ] = response . parsed Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null. JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = \ No newline at end of file diff --git a/docstore/0d79afbe-75f8-436e-acfd-f9b31c067f73 b/docstore/0d79afbe-75f8-436e-acfd-f9b31c067f73 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/0d79afbe-75f8-436e-acfd-f9b31c067f73 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. 
Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/0d81a67d-2c03-4891-952b-de4fb85f47fd b/docstore/0d81a67d-2c03-4891-952b-de4fb85f47fd new file mode 100644 index 0000000000000000000000000000000000000000..13dae0738dbe1cc658da4f8d80ffe7f33c50362c --- /dev/null +++ b/docstore/0d81a67d-2c03-4891-952b-de4fb85f47fd @@ -0,0 +1 @@ +(Multimodal Live API) Models supported All Gemini 2.0 and 2.5 models Only Flash experimental models File input types supported .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts Plotting libraries supported Matplotlib, seaborn Matplotlib, seaborn Multi-tool use Yes (code execution + grounding only) Yes Billing There's no additional charge for enabling code execution from the Gemini API. You'll be billed at the current rate of input and output tokens based on the Gemini model you're using. Here are a few other things to know about billing for code execution: You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model. Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images. Code execution results are also counted as output tokens. The billing model is shown in the following diagram: You're billed at the current rate of input and output tokens based on the Gemini model you're using. If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled intermediate tokens and are billed as input tokens . Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as output tokens . The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt. Limitations The model can only generate and execute code. It can't return other artifacts like media files. In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story). There is some variation in the ability of the different models to use code execution successfully. 
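To make the code execution behavior described above concrete, here is a minimal, hedged sketch of enabling the code execution tool from the Python google-genai SDK with gemini-2.5-flash; the prompt is arbitrary, and the part names reflect the SDK's response structure.

Python

# Illustrative sketch: enable code execution and inspect the returned parts
# (assumes google-genai is installed and GEMINI_API_KEY is set).
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the sum of the first 50 prime numbers? Generate and run code for the calculation.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())]
    ),
)

# The response interleaves text, generated code, and execution output.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    if part.executable_code is not None:
        print(part.executable_code.code)          # code the model generated
    if part.code_execution_result is not None:
        print(part.code_execution_result.output)  # result of running that code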
Supported libraries The code execution \ No newline at end of file diff --git a/docstore/0db8af4f-a9f5-4c1f-8d76-46dce5745639 b/docstore/0db8af4f-a9f5-4c1f-8d76-46dce5745639 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/0db8af4f-a9f5-4c1f-8d76-46dce5745639 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/0dc31b37-24d2-4b3c-b3a4-156b962e3191 b/docstore/0dc31b37-24d2-4b3c-b3a4-156b962e3191 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/0dc31b37-24d2-4b3c-b3a4-156b962e3191 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/0dc9be48-eda6-48f4-a2f8-7bbdef4bc91a b/docstore/0dc9be48-eda6-48f4-a2f8-7bbdef4bc91a new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/0dc9be48-eda6-48f4-a2f8-7bbdef4bc91a @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. 
With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/0dd01758-5b3e-4ed5-8e57-b89baf9e060d b/docstore/0dd01758-5b3e-4ed5-8e57-b89baf9e060d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/0dd01758-5b3e-4ed5-8e57-b89baf9e060d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/0dda501c-6e90-4c33-bb07-8e3debd5e49f b/docstore/0dda501c-6e90-4c33-bb07-8e3debd5e49f new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/0dda501c-6e90-4c33-bb07-8e3debd5e49f @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
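As a side illustration, and not part of the model card that follows, a minimal sketch of pairing a thinking model with one of the tools described above (Google Search grounding), assuming gemini-2.5-flash and the Python google-genai SDK; the thinking budget value is arbitrary.

Python

# Illustrative sketch: thinking model + Google Search tool
# (assumes google-genai is installed and GEMINI_API_KEY is set).
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Who won Euro 2024, and by what score in the final?",
    config=types.GenerateContentConfig(
        # Optional cap on thinking; omit to let the model decide how much to think.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
        tools=[types.Tool(google_search=types.GoogleSearch())],
    ),
)

print(response.text)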
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/0de2fe72-e85a-4653-bf87-5d9e2d24b3d3 b/docstore/0de2fe72-e85a-4653-bf87-5d9e2d24b3d3 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/0de2fe72-e85a-4653-bf87-5d9e2d24b3d3 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. 
A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/0de53d1a-3d17-4d53-b67d-cef3951b1b5f b/docstore/0de53d1a-3d17-4d53-b67d-cef3951b1b5f new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/0de53d1a-3d17-4d53-b67d-cef3951b1b5f @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/0e1ea2d6-82c9-4798-93a0-bb468db016a4 b/docstore/0e1ea2d6-82c9-4798-93a0-bb468db016a4 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/0e1ea2d6-82c9-4798-93a0-bb468db016a4 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/0e2df469-e8b2-4e7e-aea9-93eb072bd272 b/docstore/0e2df469-e8b2-4e7e-aea9-93eb072bd272 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/0e2df469-e8b2-4e7e-aea9-93eb072bd272 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. 
Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/0e34bfa6-5d0f-459e-ac2a-20a1be6f829b b/docstore/0e34bfa6-5d0f-459e-ac2a-20a1be6f829b new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/0e34bfa6-5d0f-459e-ac2a-20a1be6f829b @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/0e3f042f-e6db-426a-b2a6-23a9fc21f73a b/docstore/0e3f042f-e6db-426a-b2a6-23a9fc21f73a new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/0e3f042f-e6db-426a-b2a6-23a9fc21f73a @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/0e447037-f33e-42e2-97d1-e90b24fb8bc3 b/docstore/0e447037-f33e-42e2-97d1-e90b24fb8bc3 new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/0e447037-f33e-42e2-97d1-e90b24fb8bc3 @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. 
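As a rough illustration of carrying thought signatures across turns, the sketch below (google-genai Python SDK; the get_weather declaration and its canned result are hypothetical) simply appends the model's entire previous content, signature parts included, to the next request instead of rebuilding individual parts, as the guidance above requires. Python
from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical function declaration; thinking is on by default for 2.5 models.
get_weather = types.FunctionDeclaration(
    name="get_weather",
    description="Look up the current weather for a city.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"city": types.Schema(type=types.Type.STRING)},
        required=["city"],
    ),
)
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[get_weather])]
)

contents = [
    types.Content(role="user", parts=[types.Part(text="What's the weather in Paris?")])
]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

# Send the previous content back untouched, so any thought signature parts
# travel with it, then add the function result as a new part.
contents.append(response.candidates[0].content)
contents.append(
    types.Content(
        role="user",
        parts=[types.Part.from_function_response(name="get_weather", response={"temp_c": 18})],
    )
)
follow_up = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
print(follow_up.text)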
When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/0e671232-533b-4eec-9155-d4a6905b700d b/docstore/0e671232-533b-4eec-9155-d4a6905b700d new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/0e671232-533b-4eec-9155-d4a6905b700d @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' 
}]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/0e91c8a8-576b-4872-9104-aca40b2cbdb6 b/docstore/0e91c8a8-576b-4872-9104-aca40b2cbdb6 new file mode 100644 index 0000000000000000000000000000000000000000..6e70ea340054564b3fe69259322fecece1f89448 --- /dev/null +++ b/docstore/0e91c8a8-576b-4872-9104-aca40b2cbdb6 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#limitations Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/0ea22f78-d5eb-405f-a3c3-a7222d0e174b b/docstore/0ea22f78-d5eb-405f-a3c3-a7222d0e174b new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/0ea22f78-d5eb-405f-a3c3-a7222d0e174b @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/0eda6c8f-964d-408d-8f85-d3955dee038f b/docstore/0eda6c8f-964d-408d-8f85-d3955dee038f new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/0eda6c8f-964d-408d-8f85-d3955dee038f @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. 
It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/0ee4c98d-964a-4a07-8f74-b2d929f43b33 b/docstore/0ee4c98d-964a-4a07-8f74-b2d929f43b33 new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/0ee4c98d-964a-4a07-8f74-b2d929f43b33 @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . 
text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/0ee5b45d-36f4-4162-96cc-6c0dda04c79b b/docstore/0ee5b45d-36f4-4162-96cc-6c0dda04c79b new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/0ee5b45d-36f4-4162-96cc-6c0dda04c79b @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/0efd0e0c-1882-4e12-b9c9-739206ef6f4f b/docstore/0efd0e0c-1882-4e12-b9c9-739206ef6f4f new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/0efd0e0c-1882-4e12-b9c9-739206ef6f4f @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . 
Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/0f02231c-d50b-4700-abe1-6d06820c64e5 b/docstore/0f02231c-d50b-4700-abe1-6d06820c64e5 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/0f02231c-d50b-4700-abe1-6d06820c64e5 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/0f056cab-7392-4727-b5f9-611160ed37b5 b/docstore/0f056cab-7392-4727-b5f9-611160ed37b5 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd210d4957ccdbb55df147cb99efb49a9932b2 --- /dev/null +++ b/docstore/0f056cab-7392-4727-b5f9-611160ed37b5 @@ -0,0 +1 @@ +tokens Context caching price Not available $0.3125, prompts <= 128k tokens $0.625, prompts > 128k tokens Context caching (storage) Not available $4.50 per hour Tuning price Not available Not available Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Text Embedding 004 Our state-of-the-art text embedding model. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Tuning price Not available Not available Used to improve our products Yes No [*] Google AI Studio usage is free of charge in all available regions . See Billing FAQs for details. [**] Prices may differ from the prices listed here and the prices offered on Vertex AI. For Vertex prices, see the Vertex AI pricing page . [***] If you are using dynamic retrieval to optimize costs, only requests that contain at least one grounding support URL from the web in their response are charged for Grounding with Google Search. Costs for Gemini always apply. Rate limits are subject to change. Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/0f1f12e0-9ff3-44fd-abbf-6f4774c3a217 b/docstore/0f1f12e0-9ff3-44fd-abbf-6f4774c3a217 new file mode 100644 index 0000000000000000000000000000000000000000..a2cbfdbb7443e27cd6ac9a7593dc5fc212a17b87 --- /dev/null +++ b/docstore/0f1f12e0-9ff3-44fd-abbf-6f4774c3a217 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/0f396bc1-edd5-43e7-ab6c-724b4aa50c9c b/docstore/0f396bc1-edd5-43e7-ab6c-724b4aa50c9c new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/0f396bc1-edd5-43e7-ab6c-724b4aa50c9c @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. 
When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/0f5149db-d7a9-487f-b08a-cfe87283d14a b/docstore/0f5149db-d7a9-487f-b08a-cfe87283d14a new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/0f5149db-d7a9-487f-b08a-cfe87283d14a @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/0f6c2199-49ac-4d62-ab76-d873a2fdbf8d b/docstore/0f6c2199-49ac-4d62-ab76-d873a2fdbf8d new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/0f6c2199-49ac-4d62-ab76-d873a2fdbf8d @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. 
What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/0f6c2212-7a84-46df-8e04-961b7ba91108 b/docstore/0f6c2212-7a84-46df-8e04-961b7ba91108 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/0f6c2212-7a84-46df-8e04-961b7ba91108 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
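As a minimal sketch of talking to one of these native audio models over the Live API (google-genai Python SDK, audio-only output; the prompt is a placeholder and error handling is omitted): Python
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
MODEL = "gemini-2.5-flash-preview-native-audio-dialog"

async def main():
    config = types.LiveConnectConfig(response_modalities=["AUDIO"])
    async with client.aio.live.connect(model=MODEL, config=config) as session:
        # Send a single text turn and collect the audio the model streams back.
        await session.send_client_content(
            turns=[{"role": "user", "parts": [{"text": "Say hello in a cheerful voice."}]}],
            turn_complete=True,
        )
        audio = bytearray()
        async for message in session.receive():
            if message.data is not None:  # raw audio bytes from the model
                audio.extend(message.data)
        # `audio` now holds 16-bit PCM that can be written out as a WAV file.

asyncio.run(main())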
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/0f6e71c8-e7d7-4dac-be0f-bdaa7860db9e b/docstore/0f6e71c8-e7d7-4dac-be0f-bdaa7860db9e new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/0f6e71c8-e7d7-4dac-be0f-bdaa7860db9e @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/0f809098-0ac4-458d-b730-85b54933c351 b/docstore/0f809098-0ac4-458d-b730-85b54933c351 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/0f809098-0ac4-458d-b730-85b54933c351 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . 
debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/0f80bd14-c555-4a89-9e54-16810cfb6243 b/docstore/0f80bd14-c555-4a89-9e54-16810cfb6243 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/0f80bd14-c555-4a89-9e54-16810cfb6243 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . 
split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/0f8f74bf-68d3-49a5-a735-7573555f5ab4 b/docstore/0f8f74bf-68d3-49a5-a735-7573555f5ab4 new file mode 100644 index 0000000000000000000000000000000000000000..53d3e426b4ff6e145f59bebdb86773397956de24 --- /dev/null +++ b/docstore/0f8f74bf-68d3-49a5-a735-7573555f5ab4 @@ -0,0 +1 @@ +field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. Schemas that contain $ref cannot contain any properties other than those starting with a $ . Here's an example of generating a JSON Schema with Pydantic and submitting it to the model: curl "https://generativelanguage.googleapis.com/v1alpha/models/\ gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d @- < = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/0fedbf56-8db7-4e2e-b6c9-0aec1405e211 b/docstore/0fedbf56-8db7-4e2e-b6c9-0aec1405e211 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/0fedbf56-8db7-4e2e-b6c9-0aec1405e211 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. 
The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/101772ea-cbc8-4325-8e4b-0d004cd22099 b/docstore/101772ea-cbc8-4325-8e4b-0d004cd22099 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/101772ea-cbc8-4325-8e4b-0d004cd22099 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
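For the embedding models described above, a minimal sketch with the google-genai Python SDK (the input strings are placeholders) looks roughly like this: Python
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=[
        "What is the meaning of life?",
        "How does photosynthesis work?",
    ],
)
# One embedding per input string; each exposes a 768-dimensional values vector.
for embedding in result.embeddings:
    print(len(embedding.values))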
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/10206f79-76fe-464f-8d0d-4a6862c6d380 b/docstore/10206f79-76fe-464f-8d0d-4a6862c6d380 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/10206f79-76fe-464f-8d0d-4a6862c6d380 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/1032977d-3428-4cd0-b04b-bf46e626eafd b/docstore/1032977d-3428-4cd0-b04b-bf46e626eafd new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/1032977d-3428-4cd0-b04b-bf46e626eafd @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
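A rough sketch of calling Imagen through the google-genai Python SDK (the prompt is a placeholder; the model code is taken from the details that follow): Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-4.0-generate-preview-06-06",
    prompt="A close-up photo of a hummingbird hovering over a red flower",
    config=types.GenerateImagesConfig(number_of_images=1),
)
# Each generated image is returned as raw bytes that can be written to disk.
with open("hummingbird.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)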
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/10501d48-451d-4d6b-ab94-51e09f616119 b/docstore/10501d48-451d-4d6b-ab94-51e09f616119 new file mode 100644 index 0000000000000000000000000000000000000000..18a1ff5c556d92393fcec7796852542aeb203d6c --- /dev/null +++ b/docstore/10501d48-451d-4d6b-ab94-51e09f616119 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/10843ab4-d23f-4a0c-a24a-6b46ac0db178 b/docstore/10843ab4-d23f-4a0c-a24a-6b46ac0db178 new file mode 100644 index 0000000000000000000000000000000000000000..fd0551506f8bbd5887cf64565b1e4a5e868fecf1 --- /dev/null +++ b/docstore/10843ab4-d23f-4a0c-a24a-6b46ac0db178 @@ -0,0 +1 @@ +learn more about the latest YOUR_subject . Code examples with URL context only Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" url_context_tool = Tool ( url_context = types . UrlContext ) response = client . models . generate_content ( model = model_id , contents = "Compare recipes from YOUR_URL1 and YOUR_URL2 " , config = GenerateContentConfig ( tools = [ url_context_tool ], response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : [ "Compare recipes from YOUR_URL1 and YOUR_URL2 " , ], config : { tools : [{ urlContext : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Compare recipes from YOUR_URL1 and YOUR_URL2 "} ] } ], "tools": [ { "url_context": {} } ] }' > result.json cat result.json Code examples with Grounding with Google Search Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" tools = [] tools . append ( Tool ( url_context = types . UrlContext )) tools . append ( Tool ( google_search = types . GoogleSearch )) response = client . models . generate_content ( model = model_id , contents = "Give me three day events schedule based on \ No newline at end of file diff --git a/docstore/10950273-59c3-4e77-b522-1cea76ceff61 b/docstore/10950273-59c3-4e77-b522-1cea76ceff61 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/10950273-59c3-4e77-b522-1cea76ceff61 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . 
Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/10b1f57e-af72-456c-92b0-b6f6f01e8e86 b/docstore/10b1f57e-af72-456c-92b0-b6f6f01e8e86 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/10b1f57e-af72-456c-92b0-b6f6f01e8e86 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/10b49829-3abf-46cd-8cf4-3bc4112f9924 b/docstore/10b49829-3abf-46cd-8cf4-3bc4112f9924 new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/10b49829-3abf-46cd-8cf4-3bc4112f9924 @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . 
PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/10c980d1-6968-4997-bd48-def736b150ac b/docstore/10c980d1-6968-4997-bd48-def736b150ac new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/10c980d1-6968-4997-bd48-def736b150ac @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. 
It's your application's responsibility to \ No newline at end of file diff --git a/docstore/10d036d3-7090-4b94-9c26-62ac42533b2b b/docstore/10d036d3-7090-4b94-9c26-62ac42533b2b new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/10d036d3-7090-4b94-9c26-62ac42533b2b @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/10d16535-6903-4ac1-a74e-e3c147e49317 b/docstore/10d16535-6903-4ac1-a74e-e3c147e49317 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/10d16535-6903-4ac1-a74e-e3c147e49317 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
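A minimal, non-authoritative sketch of the multi-image prompting mentioned just above ("Prompting with multiple images"), assuming the google-genai Python SDK already shown in these docs; the file paths and the comparison prompt are hypothetical placeholders, not values from the original page:

# Illustrative sketch only: several images plus text in a single request.
# Paths are hypothetical; the SDK accepts PIL images directly in `contents`,
# as in the single-image example earlier in these docs.
from PIL import Image
from google import genai

client = genai.Client()  # expects GEMINI_API_KEY in the environment

image_1 = Image.open("/path/to/first_image.jpg")   # hypothetical path
image_2 = Image.open("/path/to/second_image.jpg")  # hypothetical path

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "What do these two images have in common?",
        image_1,
        image_2,
    ],
)
print(response.text)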
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/10d543ab-e675-4ff8-98a4-6ee84af99464 b/docstore/10d543ab-e675-4ff8-98a4-6ee84af99464 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/10d543ab-e675-4ff8-98a4-6ee84af99464 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/10f1d82e-2694-410f-a577-ef89e3cf17c8 b/docstore/10f1d82e-2694-410f-a577-ef89e3cf17c8 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/10f1d82e-2694-410f-a577-ef89e3cf17c8 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/110b0c02-f715-45da-8cdd-e922715246a1 b/docstore/110b0c02-f715-45da-8cdd-e922715246a1 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/110b0c02-f715-45da-8cdd-e922715246a1 @@ -0,0 +1 @@ +images before performing its reasoning task. 
Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/113d6325-ef79-42d9-ac3c-aacc8100bdd8 b/docstore/113d6325-ef79-42d9-ac3c-aacc8100bdd8 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/113d6325-ef79-42d9-ac3c-aacc8100bdd8 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/113df0e3-0bf4-4a3d-b024-24acbaa8b606 b/docstore/113df0e3-0bf4-4a3d-b024-24acbaa8b606 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/113df0e3-0bf4-4a3d-b024-24acbaa8b606 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. 
The \ No newline at end of file diff --git a/docstore/11498b54-c39b-44a8-8bba-4691fc255a15 b/docstore/11498b54-c39b-44a8-8bba-4691fc255a15 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/11498b54-c39b-44a8-8bba-4691fc255a15 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/11573f29-040a-4f93-9c3f-851d9e1d5d53 b/docstore/11573f29-040a-4f93-9c3f-851d9e1d5d53 new file mode 100644 index 0000000000000000000000000000000000000000..8b73f4a836a67cf93dfb23276728e315a19b4e83 --- /dev/null +++ b/docstore/11573f29-040a-4f93-9c3f-851d9e1d5d53 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/rate-limits#usage-tiers Title: Rate limits | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1161a596-1d40-450e-bc30-c816954804d7 b/docstore/1161a596-1d40-450e-bc30-c816954804d7 new file mode 100644 index 0000000000000000000000000000000000000000..1dc556e1b0caa0a5554ef0b35478c8c26ac17b7d --- /dev/null +++ b/docstore/1161a596-1d40-450e-bc30-c816954804d7 @@ -0,0 +1 @@ +print ( response . text ) JavaScript and TypeScript You can access both Gemini Developer API and Vertex AI services through @google/genai library. See libraries page for instructions on how to install @google/genai . Gemini Developer API import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Vertex AI Gemini API import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ vertexai : true , project : 'your_project' , location : 'your_location' , }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go You can access both Gemini Developer API and Vertex AI services through google.golang.org/genai library. See libraries page for instructions on how to install google.golang.org/genai . Gemini Developer API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your Google API key const apiKey = "your-api-key" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) } Vertex AI Gemini API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your GCP project const project = "your-project" // A GCP location like "us-central1" const location = "some-gcp-location" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . ClientConfig { Project : project , Location : location , Backend : genai . BackendVertexAI , }) // Call the GenerateContent method. \ No newline at end of file diff --git a/docstore/116bf3fc-e0d6-4db1-88d5-49b2a51cab47 b/docstore/116bf3fc-e0d6-4db1-88d5-49b2a51cab47 new file mode 100644 index 0000000000000000000000000000000000000000..4b5f15989e784aa4b4f5462e86ee08ece0d0f480 --- /dev/null +++ b/docstore/116bf3fc-e0d6-4db1-88d5-49b2a51cab47 @@ -0,0 +1 @@ +pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = client . files . upload ( file = 'a11.txt' ) response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Can you summarize this file:' , my_file ] ) print ( response . text ) List and get List uploaded files and get an uploaded file with a filename: Before Python import google.generativeai as genai for file in genai . list_files (): print ( file . name ) file = genai . get_file ( name = file . name ) After Python from google import genai client = genai . Client () for file in client . files . list (): print ( file . name ) file = client . files . get ( name = file . name ) Delete Delete a file: Before Python import pathlib import google.generativeai as genai pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = genai . upload_file ( path = 'dummy.txt' ) file = genai . delete_file ( name = dummy_file . name ) After Python import pathlib from google import genai client = genai . Client () pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = client . files . upload ( file = 'dummy.txt' ) response = client . files . delete ( name = dummy_file . name ) Context caching Context caching allows the user to pass the content to the model once, cache the input tokens, and then refer to the cached tokens in subsequent calls to lower the cost. Before Python import requests import pathlib import google.generativeai as genai from google.generativeai import caching # Download file response = requests . 
get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = genai . upload_file ( path = "a11.txt" ) # Create cache apollo_cache = caching . CachedContent . create ( model = "gemini-1.5-flash-001" , system_instruction = "You are an expert at analyzing transcripts." , contents = [ document ], ) # Generate response apollo_model = genai . GenerativeModel . \ No newline at end of file diff --git a/docstore/11743ee2-0f7a-4718-87d3-6e906918876c b/docstore/11743ee2-0f7a-4718-87d3-6e906918876c new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/11743ee2-0f7a-4718-87d3-6e906918876c @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/11c4212b-2982-4488-9c35-a73d4f1a93bb b/docstore/11c4212b-2982-4488-9c35-a73d4f1a93bb new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/11c4212b-2982-4488-9c35-a73d4f1a93bb @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . 
create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/11cfa2e6-f24c-467c-9746-b5b81f8ec81c b/docstore/11cfa2e6-f24c-467c-9746-b5b81f8ec81c new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/11cfa2e6-f24c-467c-9746-b5b81f8ec81c @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/125abadd-7760-4a3b-8790-3b7d6c3a18a3 b/docstore/125abadd-7760-4a3b-8790-3b7d6c3a18a3 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/125abadd-7760-4a3b-8790-3b7d6c3a18a3 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
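The configuration examples a little earlier pass only system_instruction and point to GenerateContentConfig for the full list of parameters; as a hedged sketch (not an exhaustive or authoritative listing), other common fields such as temperature and max_output_tokens can be set on the same object, assuming the google-genai Python SDK:

# Minimal sketch: a system instruction combined with common sampling settings.
# Field names follow the google-genai GenerateContentConfig; values are illustrative.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize why the sky is blue in one sentence.",
    config=types.GenerateContentConfig(
        system_instruction="You are a concise science explainer.",
        temperature=0.2,        # lower values tend to give more deterministic output
        max_output_tokens=256,  # cap the length of the response
    ),
)
print(response.text)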
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/12668c2b-3e33-44e1-9d41-7452c9642e94 b/docstore/12668c2b-3e33-44e1-9d41-7452c9642e94 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/12668c2b-3e33-44e1-9d41-7452c9642e94 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/1275b0db-efea-424d-beb6-3a6da47f13e9 b/docstore/1275b0db-efea-424d-beb6-3a6da47f13e9 new file mode 100644 index 0000000000000000000000000000000000000000..53d3e426b4ff6e145f59bebdb86773397956de24 --- /dev/null +++ b/docstore/1275b0db-efea-424d-beb6-3a6da47f13e9 @@ -0,0 +1 @@ +field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. 
Schemas that contain $ref cannot contain any properties other than those starting with a $ . Here's an example of generating a JSON Schema with Pydantic and submitting it to the model: curl "https://generativelanguage.googleapis.com/v1alpha/models/\ gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d @- < /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/128042c3-9a5c-4a3e-8b9a-9f010923b827 b/docstore/128042c3-9a5c-4a3e-8b9a-9f010923b827 new file mode 100644 index 0000000000000000000000000000000000000000..03e11f1b3b2cd84e15c6098d543ad30ece4e0a72 --- /dev/null +++ b/docstore/128042c3-9a5c-4a3e-8b9a-9f010923b827 @@ -0,0 +1 @@ +Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . 
log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that \ No newline at end of file diff --git a/docstore/128db2cb-d72c-4edf-8b9d-25c8be0d7f5b b/docstore/128db2cb-d72c-4edf-8b9d-25c8be0d7f5b new file mode 100644 index 0000000000000000000000000000000000000000..5c1a9aa70176bd8ed0ea76a30ce2ef7a94d7e3a2 --- /dev/null +++ b/docstore/128db2cb-d72c-4edf-8b9d-25c8be0d7f5b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding Title: Video understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1290bc9f-a12b-41e4-9805-04121e62b2ee b/docstore/1290bc9f-a12b-41e4-9805-04121e62b2ee new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/1290bc9f-a12b-41e4-9805-04121e62b2ee @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/129a623d-6efb-4081-9702-c54359c14c93 b/docstore/129a623d-6efb-4081-9702-c54359c14c93 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/129a623d-6efb-4081-9702-c54359c14c93 @@ -0,0 +1 @@ +patterns for more details. 
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/12bc506b-63ec-41b2-9eaa-b832e48973f2 b/docstore/12bc506b-63ec-41b2-9eaa-b832e48973f2 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/12bc506b-63ec-41b2-9eaa-b832e48973f2 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . 
candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/12de5872-f055-439f-84af-f723454bb6e8 b/docstore/12de5872-f055-439f-84af-f723454bb6e8 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/12de5872-f055-439f-84af-f723454bb6e8 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
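The function-declaration format described a little earlier (a tools object holding declarations written in a select subset of the OpenAPI schema) can be wired into a request roughly as follows. This is an illustrative, non-authoritative sketch using the google-genai Python SDK; the get_weather function and its parameters are hypothetical, not taken from the original page:

# Hedged sketch: one hypothetical function declaration passed as a tool.
from google import genai
from google.genai import types

client = genai.Client()

get_weather_declaration = {
    "name": "get_weather",
    "description": "Returns the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {
            "city": {"type": "string", "description": "City name, e.g. London"},
        },
        "required": ["city"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[get_weather_declaration])]
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the weather in London?",
    config=config,
)

# The model may answer directly or request a function call for the app to execute.
part = response.candidates[0].content.parts[0]
if part.function_call:
    print(part.function_call.name, dict(part.function_call.args))
else:
    print(response.text)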
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/12ec23cb-3241-401a-b4fd-4e1d120a0c20 b/docstore/12ec23cb-3241-401a-b4fd-4e1d120a0c20 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/12ec23cb-3241-401a-b4fd-4e1d120a0c20 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/130ba120-88a1-4c2c-ae1b-198e5bccc573 b/docstore/130ba120-88a1-4c2c-ae1b-198e5bccc573 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/130ba120-88a1-4c2c-ae1b-198e5bccc573 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. 
\ No newline at end of file diff --git a/docstore/1320970f-bba5-4822-92cc-9d73b8d67327 b/docstore/1320970f-bba5-4822-92cc-9d73b8d67327 new file mode 100644 index 0000000000000000000000000000000000000000..321d841188c5250c057a057a233b2d467cd53bf3 --- /dev/null +++ b/docstore/1320970f-bba5-4822-92cc-9d73b8d67327 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation?lang=python#generate-a-text-stream Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1324d20c-6da5-44d9-85bf-472c3b108266 b/docstore/1324d20c-6da5-44d9-85bf-472c3b108266 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/1324d20c-6da5-44d9-85bf-472c3b108266 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/133a3dfc-b175-461c-803b-65e2f0a017bc b/docstore/133a3dfc-b175-461c-803b-65e2f0a017bc new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/133a3dfc-b175-461c-803b-65e2f0a017bc @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. 
The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/1359a19b-f55d-435e-a115-ed63318ab2a3 b/docstore/1359a19b-f55d-435e-a115-ed63318ab2a3 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/1359a19b-f55d-435e-a115-ed63318ab2a3 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. 
Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/135a3b27-0235-4c8f-abb3-8d97e86fe833 b/docstore/135a3b27-0235-4c8f-abb3-8d97e86fe833 new file mode 100644 index 0000000000000000000000000000000000000000..01ae62c8740ecd40460af64187bff6feef8cdae5 --- /dev/null +++ b/docstore/135a3b27-0235-4c8f-abb3-8d97e86fe833 @@ -0,0 +1 @@ +are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : \ No newline at end of file diff --git a/docstore/13700397-744f-43e5-a29e-91c2395d4361 b/docstore/13700397-744f-43e5-a29e-91c2395d4361 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/13700397-744f-43e5-a29e-91c2395d4361 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
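As a rough illustration of how an Imagen model such as those listed in the details below might be called from the google-genai Python SDK: this is a sketch, and the prompt, output filename, and use of the imagen-3.0-generate-002 model code are assumptions rather than content from this page.

```python
from google import genai
from google.genai import types

client = genai.Client()

# Ask Imagen for a single image; see the model details below for current model codes.
result = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A photorealistic red fox standing in fresh snow at sunrise",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Each generated image carries raw bytes that can be written straight to disk.
with open("fox.png", "wb") as f:
    f.write(result.generated_images[0].image.image_bytes)
```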
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/137054c8-b410-4914-a99a-89dcbaeb656a b/docstore/137054c8-b410-4914-a99a-89dcbaeb656a new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/137054c8-b410-4914-a99a-89dcbaeb656a @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/13941de8-5260-4d20-949b-20a363aedd3c b/docstore/13941de8-5260-4d20-949b-20a363aedd3c new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/13941de8-5260-4d20-949b-20a363aedd3c @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. 
This means \ No newline at end of file diff --git a/docstore/13e00114-b5b8-44c0-9816-c0e51c4b626f b/docstore/13e00114-b5b8-44c0-9816-c0e51c4b626f new file mode 100644 index 0000000000000000000000000000000000000000..4f5a135f81a8ec2b4f9be3f8bcfa685a50149381 --- /dev/null +++ b/docstore/13e00114-b5b8-44c0-9816-c0e51c4b626f @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Can you add a few more lines to this poem?"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json You can verify the API successfully stored the uploaded file and get its metadata by calling files.get . Only the name (and by extension, the uri ) are unique. Python from google import genai import pathlib client = genai . Client () fpath = pathlib . Path ( 'example.txt' ) fpath . write_text ( 'hello' ) file = client . files . upload ( file = 'example.txt' ) file_info = client . files . get ( name = file . name ) print ( file_info . model_dump_json ( indent = 4 )) REST name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri Passing multiple PDFs The Gemini API is capable of processing multiple PDF documents (up to 1000 pages) in a single request, as long as the combined size of the documents and the text prompt stays within the model's context window. Python from google import genai import io import httpx client = genai . Client () doc_url_1 = "https://arxiv.org/pdf/2312.11805" doc_url_2 = "https://arxiv.org/pdf/2403.05530" # Retrieve and upload both PDFs using the File API doc_data_1 = io . BytesIO ( httpx . get ( doc_url_1 ) . content ) doc_data_2 = io . BytesIO ( httpx . get ( doc_url_2 ) . content ) sample_pdf_1 = client . files . upload ( file = doc_data_1 , config = dict ( mime_type = 'application/pdf' ) ) sample_pdf_2 = client . files . \ No newline at end of file diff --git a/docstore/13e1ca8f-0feb-4ede-b6d3-0b3a78de84c6 b/docstore/13e1ca8f-0feb-4ede-b6d3-0b3a78de84c6 new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/13e1ca8f-0feb-4ede-b6d3-0b3a78de84c6 @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. 
It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/13f172f0-8426-4d24-bcbb-50b040eed76a b/docstore/13f172f0-8426-4d24-bcbb-50b040eed76a new file mode 100644 index 0000000000000000000000000000000000000000..9cff723635cec00cbce8e3bb3861a0bdc7cb1f7f --- /dev/null +++ b/docstore/13f172f0-8426-4d24-bcbb-50b040eed76a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-embedding Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/13f9ac18-19e8-4130-a29b-8309c124dcd4 b/docstore/13f9ac18-19e8-4130-a29b-8309c124dcd4 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/13f9ac18-19e8-4130-a29b-8309c124dcd4 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/14511d4c-97a3-4b48-aabc-c58cf02d4cd3 b/docstore/14511d4c-97a3-4b48-aabc-c58cf02d4cd3 new file mode 100644 index 0000000000000000000000000000000000000000..08922eb1e5da83e7a67a2a4aeaf4437890d1333a --- /dev/null +++ b/docstore/14511d4c-97a3-4b48-aabc-c58cf02d4cd3 @@ -0,0 +1 @@ +trademark of Oracle and/or its affiliates. Last updated 2025-05-31 UTC. \ No newline at end of file diff --git a/docstore/146e5790-8fb5-4b6f-a708-b9207a9b2414 b/docstore/146e5790-8fb5-4b6f-a708-b9207a9b2414 new file mode 100644 index 0000000000000000000000000000000000000000..5df22ab370b3c2108c2a4e677731cc4af2835ff9 --- /dev/null +++ b/docstore/146e5790-8fb5-4b6f-a708-b9207a9b2414 @@ -0,0 +1 @@ +variables, if you don't pass one to the client. export GEMINI_API_KEY = "YOUR_API_KEY" from google import genai client = genai . Client () # Set the API key using the GEMINI_API_KEY env var. # Alternatively, you could set the API key explicitly: # client = genai.Client(api_key="your_api_key") JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); Go Import the GenAI library: import "google.golang.org/genai" Create the client: client , err := genai . NewClient ( ctx , & genai . ClientConfig { Backend : genai . BackendGeminiAPI , }) Generate content Text Before Python Previously, there were no client objects, you accessed APIs directly through GenerativeModel objects. import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'Tell me a story in 300 words' ) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Tell me a story in 300 words" ; const result = await model . generateContent ( prompt ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me a story in 300 words." )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response parts After Python The new Google GenAI SDK provides access to all the API methods through the Client object. Except for a few stateful special cases ( chat and live-api session s), these are all stateless functions. For utility and uniformity, objects returned are pydantic classes. 
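The "After" code sample is truncated here. A minimal sketch of what the new-SDK equivalent of the earlier "Tell me a story" example would look like through the Client object (assuming GEMINI_API_KEY is set in the environment; the model name is one used elsewhere on this page):

```python
from google import genai

# The Client object is the single entry point in the new Google GenAI SDK.
client = genai.Client()  # or genai.Client(api_key="your_api_key")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Tell me a story in 300 words.",
)
print(response.text)
```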
\ No newline at end of file diff --git a/docstore/146e68d4-52d3-4d5b-9a73-c3b80f3860e6 b/docstore/146e68d4-52d3-4d5b-9a73-c3b80f3860e6 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/146e68d4-52d3-4d5b-9a73-c3b80f3860e6 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/147157b1-8285-4d16-8e78-a14b9a3b3caf b/docstore/147157b1-8285-4d16-8e78-a14b9a3b3caf new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/147157b1-8285-4d16-8e78-a14b9a3b3caf @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/147c0883-66a9-40ec-8fe7-ac715704e655 b/docstore/147c0883-66a9-40ec-8fe7-ac715704e655 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/147c0883-66a9-40ec-8fe7-ac715704e655 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . 
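A minimal sketch of how the temperature knob described above might be set per request with the google-genai Python SDK; the prompt and the two temperature values are illustrative.

```python
from google import genai
from google.genai import types

client = genai.Client()

# temperature=0 keeps decoding greedy and repeatable; higher values sample
# more freely from the token distribution and give more varied completions.
for temperature in (0.0, 1.0):
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents="Finish this sentence: The dog jumped over the",
        config=types.GenerateContentConfig(temperature=temperature),
    )
    print(f"temperature={temperature}: {response.text}")
```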
To learn \ No newline at end of file diff --git a/docstore/1482527f-b85a-4f35-a643-c930ea04e960 b/docstore/1482527f-b85a-4f35-a643-c930ea04e960 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/1482527f-b85a-4f35-a643-c930ea04e960 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/148c366d-553b-42f5-a4c1-cbd327058e9b b/docstore/148c366d-553b-42f5-a4c1-cbd327058e9b new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/148c366d-553b-42f5-a4c1-cbd327058e9b @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . 
WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/148c6c43-ee15-41f5-98cf-8f9584bdba82 b/docstore/148c6c43-ee15-41f5-98cf-8f9584bdba82 new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/148c6c43-ee15-41f5-98cf-8f9584bdba82 @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. 
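Before the example prompt and response that follow, here is a minimal sketch of how a system instruction like the one above might be attached to a request with the google-genai Python SDK; the user prompt is the one used in the example below.

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is a smart way to make a business that sells DVDs in 2025?",
    config=types.GenerateContentConfig(
        # The system instruction shapes tone and verbosity for every turn.
        system_instruction=(
            "All questions should be answered comprehensively with details, "
            "unless the user requests a concise response specifically."
        ),
    ),
)
print(response.text)
```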
Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/148c85e9-5351-464e-949b-08976df2c028 b/docstore/148c85e9-5351-464e-949b-08976df2c028 new file mode 100644 index 0000000000000000000000000000000000000000..6f28e48ebcfb70ccf6166568da4836eab8a3359a --- /dev/null +++ b/docstore/148c85e9-5351-464e-949b-08976df2c028 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#token-size Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/14948583-8837-4702-9817-a8b7bd700924 b/docstore/14948583-8837-4702-9817-a8b7bd700924 new file mode 100644 index 0000000000000000000000000000000000000000..454a32565b8ca4cb0483f43d22ff4a53388da470 --- /dev/null +++ b/docstore/14948583-8837-4702-9817-a8b7bd700924 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/ephemeral-tokens Title: Ephemeral tokens | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/14a2721a-bd32-46aa-ac18-c73e9442ff78 b/docstore/14a2721a-bd32-46aa-ac18-c73e9442ff78 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/14a2721a-bd32-46aa-ac18-c73e9442ff78 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). 
If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/14a28af6-9ea2-445c-be35-177417d2c4e6 b/docstore/14a28af6-9ea2-445c-be35-177417d2c4e6 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/14a28af6-9ea2-445c-be35-177417d2c4e6 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/14aa27a2-56fa-4464-a18c-31dae0e6127d b/docstore/14aa27a2-56fa-4464-a18c-31dae0e6127d new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/14aa27a2-56fa-4464-a18c-31dae0e6127d @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/14b69e0e-e277-4ada-8a4e-0cc6459d3bac b/docstore/14b69e0e-e277-4ada-8a4e-0cc6459d3bac new file mode 100644 index 0000000000000000000000000000000000000000..b02538f85c1e26824fb9d15e124ac354f46dfed1 --- /dev/null +++ b/docstore/14b69e0e-e277-4ada-8a4e-0cc6459d3bac @@ -0,0 +1 @@ +temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . 
log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool \ No newline at end of file diff --git a/docstore/14d7fcce-8710-400e-ae3f-ba93d53814e0 b/docstore/14d7fcce-8710-400e-ae3f-ba93d53814e0 new file mode 100644 index 0000000000000000000000000000000000000000..b1044b06e974ef70df5275060bd78c27b49af935 --- /dev/null +++ b/docstore/14d7fcce-8710-400e-ae3f-ba93d53814e0 @@ -0,0 +1 @@ +ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the \ No newline at end of file diff --git a/docstore/14fc7716-c408-4972-aafa-2b18ab40881a b/docstore/14fc7716-c408-4972-aafa-2b18ab40881a new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/14fc7716-c408-4972-aafa-2b18ab40881a @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. 
The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/150634b1-94f0-404b-8f20-19cd25071472 b/docstore/150634b1-94f0-404b-8f20-19cd25071472 new file mode 100644 index 0000000000000000000000000000000000000000..f3a5c8d51af4fe74b88a61d9a283d0c7a963f683 --- /dev/null +++ b/docstore/150634b1-94f0-404b-8f20-19cd25071472 @@ -0,0 +1 @@ +workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. 
""" return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off \ No newline at end of file diff --git a/docstore/150deec6-877c-4c91-acde-eca33389cfc5 b/docstore/150deec6-877c-4c91-acde-eca33389cfc5 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/150deec6-877c-4c91-acde-eca33389cfc5 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/15137225-b9c6-4274-a9ae-db7f6d1fb4f5 b/docstore/15137225-b9c6-4274-a9ae-db7f6d1fb4f5 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/15137225-b9c6-4274-a9ae-db7f6d1fb4f5 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/152d9f7c-983b-4cb8-a343-ea48a109c120 b/docstore/152d9f7c-983b-4cb8-a343-ea48a109c120 new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/152d9f7c-983b-4cb8-a343-ea48a109c120 @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. 
To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/15395f7c-6542-4e7a-9eb3-deb86c0017a2 b/docstore/15395f7c-6542-4e7a-9eb3-deb86c0017a2 new file mode 100644 index 0000000000000000000000000000000000000000..a3fd9d3225fb67d0660508c87d747294298e3c33 --- /dev/null +++ b/docstore/15395f7c-6542-4e7a-9eb3-deb86c0017a2 @@ -0,0 +1 @@ +{ "uri" : "https://vertexaisearch.cloud.google.com....." , "title" : "uefa.com" }} ], "groundingSupports" : [ { "segment" : { "startIndex" : 0 , "endIndex" : 85 , "text" : "Spain won Euro 2024, defeatin..." }, "groundingChunkIndices" : [ 0 ] }, { "segment" : { "startIndex" : 86 , "endIndex" : 210 , "text" : "This victory marks Spain's..." }, "groundingChunkIndices" : [ 0 , 1 ] } ] } } ] } The Gemini API returns the following information with the groundingMetadata : webSearchQueries : Array of the search queries used. This is useful for debugging and understanding the model's reasoning process. searchEntryPoint : Contains the HTML and CSS to render the required Search Suggestions. Full usage requirements are detailed in the Terms of Service . groundingChunks : Array of objects containing the web sources ( uri and title ). groundingSupports : Array of chunks to connect model response text to the sources in groundingChunks . Each chunk links a text segment (defined by startIndex and endIndex ) to one or more groundingChunkIndices . This is the key to building inline citations. Grounding with Google Search can also be used in combination with the URL context tool to ground responses in both public web data and the specific URLs you provide. Attributing Sources with inline Citations The API returns structured citation data, giving you complete control over how you display sources in your user interface. You can use the groundingSupports and groundingChunks fields to link the model's statements directly to their sources. Here is a common pattern for processing the metadata to create a response with inline, clickable citations. Python def add_citations ( response ): text = response . text supports = response . candidates [ 0 ] . grounding_metadata . grounding_supports chunks = response . candidates [ 0 ] . grounding_metadata . grounding_chunks # Sort supports by end_index in descending order to avoid shifting issues when inserting. sorted_supports = sorted ( supports , key \ No newline at end of file diff --git a/docstore/1557dbe5-e7d9-4f9b-b573-cec9e8a3a8a2 b/docstore/1557dbe5-e7d9-4f9b-b573-cec9e8a3a8a2 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/1557dbe5-e7d9-4f9b-b573-cec9e8a3a8a2 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
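The citation helper above is cut off in this extract, so here is a separate, self-contained sketch of the same pattern, using the field names shown in the grounding metadata example (grounding_supports, grounding_chunks, segment.end_index, grounding_chunk_indices); treat it as illustrative rather than the canonical helper.

Python

def add_inline_citations(response) -> str:
    """Splice markdown-style [n](uri) citation links into the response text."""
    text = response.text
    metadata = response.candidates[0].grounding_metadata
    supports = metadata.grounding_supports or []
    chunks = metadata.grounding_chunks or []

    # Work backwards through the text so earlier indices stay valid after each insert.
    for support in sorted(supports, key=lambda s: s.segment.end_index, reverse=True):
        links = [
            f"[{i + 1}]({chunks[i].web.uri})" for i in support.grounding_chunk_indices
        ]
        end = support.segment.end_index
        text = text[:end] + " " + ", ".join(links) + text[end:]
    return text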
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/156ed1f8-e978-4d38-ac61-5d519bc8f7b6 b/docstore/156ed1f8-e978-4d38-ac61-5d519bc8f7b6 new file mode 100644 index 0000000000000000000000000000000000000000..8639c839707c6139e726ba0ddc89bfd42831d7c1 --- /dev/null +++ b/docstore/156ed1f8-e978-4d38-ac61-5d519bc8f7b6 @@ -0,0 +1 @@ +POST \ -d '{ "contents": [{ "parts":[ {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_1 '}}, {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_2 '}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Technical details Gemini supports a maximum of 1,000 document pages. Each document page is equivalent to 258 tokens. While there are no specific limits to the number of pixels in a document besides the model's context window , larger pages are scaled down to a maximum resolution of 3072x3072 while preserving their original aspect ratio, while smaller pages are scaled up to 768x768 pixels. There is no cost reduction for pages at lower sizes, other than bandwidth, or performance improvement for pages at higher resolution. Document types Technically, you can pass other MIME types for document understanding, like TXT, Markdown, HTML, XML, etc. However, document vision only meaningfully understands PDFs . Other types will be extracted as pure text, and the model won't be able to interpret what we see in the rendering of those files. Any file-type specifics like charts, diagrams, HTML tags, Markdown formatting, etc., will be lost. Best practices For best results: Rotate pages to the correct orientation before uploading. Avoid blurry pages. If using a single page, place the text prompt after the page. 
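To tie the technical details and best practices above together, here is a minimal sketch of PDF understanding with the Python client, mirroring the File API pattern used for images and audio elsewhere in these docs; the file path and prompt are placeholders.

Python

from google import genai

client = genai.Client()

# Upload once via the File API, then reference the uploaded file in generateContent.
pdf = client.files.upload(file="path/to/report.pdf")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[pdf, "Summarize the key findings of this document in five bullet points."],
)
print(response.text)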
What's next To learn more, see the following resources: File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers \ No newline at end of file diff --git a/docstore/157b40b4-79a7-40b4-89b4-207948f716c5 b/docstore/157b40b4-79a7-40b4-89b4-207948f716c5 new file mode 100644 index 0000000000000000000000000000000000000000..d35c152315aeb95511cf96871dd399a36b7f0b06 --- /dev/null +++ b/docstore/157b40b4-79a7-40b4-89b4-207948f716c5 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#live-api Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1586ddaa-f573-4d0c-b01e-cab69c9f0852 b/docstore/1586ddaa-f573-4d0c-b01e-cab69c9f0852 new file mode 100644 index 0000000000000000000000000000000000000000..150f8758ce4500c63fdc2d62f5bb812ca3b2d976 --- /dev/null +++ b/docstore/1586ddaa-f573-4d0c-b01e-cab69c9f0852 @@ -0,0 +1 @@ +client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/15cb6e92-ced9-48dc-9fdd-c178fe2651b8 b/docstore/15cb6e92-ced9-48dc-9fdd-c178fe2651b8 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/15cb6e92-ced9-48dc-9fdd-c178fe2651b8 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/15cfa42a-48af-4e1e-9a2c-9377aedc15d3 b/docstore/15cfa42a-48af-4e1e-9a2c-9377aedc15d3 new file mode 100644 index 0000000000000000000000000000000000000000..b63e6273e756a30fc7fe1bf2f7ffe65259b48047 --- /dev/null +++ b/docstore/15cfa42a-48af-4e1e-9a2c-9377aedc15d3 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live#main-content Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/15e3702f-ecd9-40d0-8fe6-fb36f092c199 b/docstore/15e3702f-ecd9-40d0-8fe6-fb36f092c199 new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/15e3702f-ecd9-40d0-8fe6-fb36f092c199 @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . 
GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/15e4495f-0655-419f-af10-15ffa33839f2 b/docstore/15e4495f-0655-419f-af10-15ffa33839f2 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/15e4495f-0655-419f-af10-15ffa33839f2 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. 
""" return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/15e6794a-93b6-4cca-a36c-b0e18a97849b b/docstore/15e6794a-93b6-4cca-a36c-b0e18a97849b new file mode 100644 index 0000000000000000000000000000000000000000..f3ee6bd618c8a96cb85f673bde09f147834617b9 --- /dev/null +++ b/docstore/15e6794a-93b6-4cca-a36c-b0e18a97849b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#main-content Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/15f41a22-199f-43fc-a5c9-dcfa483b9eb9 b/docstore/15f41a22-199f-43fc-a5c9-dcfa483b9eb9 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/15f41a22-199f-43fc-a5c9-dcfa483b9eb9 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/1625112a-a93d-4cdd-956f-222e6503c82b b/docstore/1625112a-a93d-4cdd-956f-222e6503c82b new file mode 100644 index 0000000000000000000000000000000000000000..4c85feaa3736483a30d5e5d588476e1ba56d6f32 --- /dev/null +++ b/docstore/1625112a-a93d-4cdd-956f-222e6503c82b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs Title: Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/162b6d82-919a-4462-9e1c-b16ad3566a32 b/docstore/162b6d82-919a-4462-9e1c-b16ad3566a32 new file mode 100644 index 0000000000000000000000000000000000000000..5b23b75839f7d9f5e86c0814ceb13216aba4c820 --- /dev/null +++ b/docstore/162b6d82-919a-4462-9e1c-b16ad3566a32 @@ -0,0 +1 @@ +Using Gemini API keys | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Using Gemini API keys To use the Gemini API, you need an API key. You can create a key for free with a few clicks in Google AI Studio . Once you have an API key, you have the following options to connect to the Gemini API: Setting your API key as an environment variable Providing your API key explicitly For initial testing, you can hard code an API key, but this should only be temporary since it's not secure. You can find examples for hard coding the API key in Providing API key explicitly section. Setting API key as environment variable If you set the environment variable GEMINI_API_KEY or GOOGLE_API_KEY , the API key will automatically be picked up by the client when using one of the Gemini API libraries . It's recommended that you set only one of those variables, but if both are set, GOOGLE_API_KEY takes precedence. If you're using the REST API, or JavaScript on the browser, you will need to provide the API key explicitly. Here is how you can set your API key locally as the environment variable GEMINI_API_KEY with different operating systems. Linux/macOS - Bash Bash is a common Linux and macOS terminal configuration. 
You can check if you have a configuration file for it by running the following command: ~/.bashrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use zsh : touch ~/.bashrc open ~/.bashrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.bashrc macOS \ No newline at end of file diff --git a/docstore/165829c7-ae6f-4961-addb-d362642ae021 b/docstore/165829c7-ae6f-4961-addb-d362642ae021 new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/165829c7-ae6f-4961-addb-d362642ae021 @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries. Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/165b661d-5f30-4653-8aa5-056942f435a8 b/docstore/165b661d-5f30-4653-8aa5-056942f435a8 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/165b661d-5f30-4653-8aa5-056942f435a8 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . 
candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/166d10d1-dbe0-434f-9089-a2d5e9d5626c b/docstore/166d10d1-dbe0-434f-9089-a2d5e9d5626c new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/166d10d1-dbe0-434f-9089-a2d5e9d5626c @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . 
Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/16772059-3ebd-4357-8e10-20e8dfd725d1 b/docstore/16772059-3ebd-4357-8e10-20e8dfd725d1 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/16772059-3ebd-4357-8e10-20e8dfd725d1 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/1677ee29-7eaa-4d8a-9051-17c3e649bc0b b/docstore/1677ee29-7eaa-4d8a-9051-17c3e649bc0b new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/1677ee29-7eaa-4d8a-9051-17c3e649bc0b @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. 
The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/167ea75a-0fc0-4f5c-b18d-0115207e0a13 b/docstore/167ea75a-0fc0-4f5c-b18d-0115207e0a13 new file mode 100644 index 0000000000000000000000000000000000000000..87bffe59c9f4797f4d0611e74f88436457c136bc --- /dev/null +++ b/docstore/167ea75a-0fc0-4f5c-b18d-0115207e0a13 @@ -0,0 +1 @@ +unlock new use cases. Some emerging and standard use cases for text based long context include: Summarizing large corpuses of text Previous summarization options with smaller context models would require a sliding window or another technique to keep state of previous sections as new tokens are passed to the model Question and answering Historically this was only possible with RAG given the limited amount of context and models' factual recall being low Agentic workflows Text is the underpinning of how agents keep state of what they have done and what they need to do; not having enough information about the world and the agent's goal is a limitation on the reliability of agents Many-shot in-context learning is one of the most unique capabilities unlocked by long context models. Research has shown that taking the common "single shot" or "multi-shot" example paradigm, where the model is presented with one or a few examples of a task, and scaling that up to hundreds, thousands, or even hundreds of thousands of examples, can lead to novel model capabilities. This many-shot approach has also been shown to perform similarly to models which were fine-tuned for a specific task. For use cases where a Gemini model's performance is not yet sufficient for a production rollout, you can try the many-shot approach. As you might explore later in the long context optimization section, context caching makes this type of high input token workload much more economically feasible and even lower latency in some cases. 
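As a rough illustration of the caching point above, the sketch below caches a large many-shot prefix once and reuses it across requests; the config fields (CreateCachedContentConfig, cached_content) and which model versions allow explicit caching are assumptions to verify against the context caching guide.

Python

from google import genai
from google.genai import types

client = genai.Client()

# A large, reusable prefix: for example, hundreds of in-context examples.
many_shot_prefix = open("examples.txt").read()

cache = client.caches.create(
    model="gemini-2.0-flash-001",  # explicit caching typically expects a pinned model version
    config=types.CreateCachedContentConfig(
        contents=[many_shot_prefix],
        ttl="3600s",
    ),
)

# Later requests reuse the cached prefix at a reduced per-token cost.
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="Classify this new example: ...",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)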
Long form video Video content's utility has long been constrained by the lack of accessibility of the medium itself. It was hard to skim the content, transcripts often failed to capture the nuance of a video, and most tools don't process image, text, and audio together. With Gemini, the long-context text capabilities translate to the ability to reason and answer questions about multimodal inputs with sustained performance. Some emerging and standard use cases for video long \ No newline at end of file diff --git a/docstore/168277f0-bcf3-422e-9a64-f80be3e262f1 b/docstore/168277f0-bcf3-422e-9a64-f80be3e262f1 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/168277f0-bcf3-422e-9a64-f80be3e262f1 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/16c8f84f-e66c-4500-9936-b0c9a4c2850f b/docstore/16c8f84f-e66c-4500-9936-b0c9a4c2850f new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/16c8f84f-e66c-4500-9936-b0c9a4c2850f @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. 
VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/16cdd521-3a28-4304-9b7a-0415fe107fa6 b/docstore/16cdd521-3a28-4304-9b7a-0415fe107fa6 new file mode 100644 index 0000000000000000000000000000000000000000..a1f30e86f8de69da772a1b833567cf406e31d0a4 --- /dev/null +++ b/docstore/16cdd521-3a28-4304-9b7a-0415fe107fa6 @@ -0,0 +1 @@ +temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is \ No newline at end of file diff --git a/docstore/16e59503-a3e1-4a3f-8383-f13ec344b482 b/docstore/16e59503-a3e1-4a3f-8383-f13ec344b482 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/16e59503-a3e1-4a3f-8383-f13ec344b482 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
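Ahead of the model details below, a minimal sketch of calling this TTS model from Python; the speech_config/voice_config fields, the "Kore" prebuilt voice, and the assumption that audio is returned as raw 24 kHz 16-bit mono PCM all come from the speech generation guide and should be verified there.

Python

from google import genai
from google.genai import types
import wave

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# Assumed output format: raw PCM that we wrap in a WAV container for playback.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)       # 16-bit samples
    wf.setframerate(24000)   # 24 kHz
    wf.writeframes(pcm)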
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/16fff0f6-5f86-4cab-8ccd-bcf53a950df7 b/docstore/16fff0f6-5f86-4cab-8ccd-bcf53a950df7 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/16fff0f6-5f86-4cab-8ccd-bcf53a950df7 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/1715f999-3ca8-4982-86b6-e5531e29517d b/docstore/1715f999-3ca8-4982-86b6-e5531e29517d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/1715f999-3ca8-4982-86b6-e5531e29517d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/17275114-de87-40e8-b5f1-b7b8367fbf43 b/docstore/17275114-de87-40e8-b5f1-b7b8367fbf43 new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/17275114-de87-40e8-b5f1-b7b8367fbf43 @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/174997e7-586c-4c14-a0f7-f7b7a55dc4c2 b/docstore/174997e7-586c-4c14-a0f7-f7b7a55dc4c2 new file mode 100644 index 0000000000000000000000000000000000000000..b9aae1f02a8caa7a25135d3bec800921c05dfc11 --- /dev/null +++ b/docstore/174997e7-586c-4c14-a0f7-f7b7a55dc4c2 @@ -0,0 +1 @@ +( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . 
create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on \ No newline at end of file diff --git a/docstore/176acfca-0d60-4d03-892f-09bf2504ccd3 b/docstore/176acfca-0d60-4d03-892f-09bf2504ccd3 new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ b/docstore/176acfca-0d60-4d03-892f-09bf2504ccd3 @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. 
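As a rough illustration of the Input File option described above, the sketch below writes a small JSONL file in which each line holds one GenerateContentRequest-style payload. The per-line envelope used here (a key plus a request field) is an assumption; check the exact schema against the Batch Mode reference before submitting a job.

Python
# Sketch: building a JSONL input file for Batch Mode. The "key"/"request"
# envelope is an assumption; verify the exact per-line schema against the
# Batch Mode documentation before use.
import json

prompts = [
    "Summarize the theory of relativity in one sentence.",
    "Write a haiku about the ocean.",
]

with open("batch_requests.jsonl", "w") as f:
    for i, prompt in enumerate(prompts):
        line = {
            "key": f"request-{i}",
            "request": {"contents": [{"parts": [{"text": prompt}]}]},
        }
        f.write(json.dumps(line) + "\n")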
Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/17700f5c-26dc-415f-9b67-a6feb77ffda9 b/docstore/17700f5c-26dc-415f-9b67-a6feb77ffda9 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/17700f5c-26dc-415f-9b67-a6feb77ffda9 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/17741ac6-ba67-4a08-b9db-de5244b95287 b/docstore/17741ac6-ba67-4a08-b9db-de5244b95287 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/17741ac6-ba67-4a08-b9db-de5244b95287 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/1775594d-a06c-4d12-afa7-19e2598c1b9d b/docstore/1775594d-a06c-4d12-afa7-19e2598c1b9d new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/1775594d-a06c-4d12-afa7-19e2598c1b9d @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. Avoid exceeding three phrases for cleaner compositions. Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . 
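To make the parameterization above concrete, here is a small sketch that fills the template from menu selections and passes the result to an Imagen model. The generate_images call and config fields mirror the SDK surface shown elsewhere on this page, but the Python parameter names (number_of_images, aspect_ratio) are assumptions to confirm against the SDK reference, since naming conventions vary by language.

Python
# Sketch: filling the parameterized Imagen prompt and requesting an image.
# The model name follows the examples on this page; treat the snake_case
# config field names as assumptions.
from google import genai
from google.genai import types

client = genai.Client()

template = (
    "A {logo_style} logo for a {company_area} company on a solid color "
    "background. Include the text {company_name}."
)
prompt = template.format(
    logo_style="minimalist",
    company_area="health care",
    company_name="Journey",
)

response = client.models.generate_images(
    model="imagen-4.0-generate-preview-06-06",
    prompt=prompt,
    config=types.GenerateImagesConfig(number_of_images=1, aspect_ratio="1:1"),
)

# Write the returned image bytes to disk, as in the Go example above.
with open("logo.png", "wb") as out:
    out.write(response.generated_images[0].image.image_bytes)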
In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/17aaa2d3-0372-4d50-ac6d-4b481d74bd1d b/docstore/17aaa2d3-0372-4d50-ac6d-4b481d74bd1d new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/17aaa2d3-0372-4d50-ac6d-4b481d74bd1d @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/17ac2f26-d891-45cf-a49e-8c42d1e00eaa b/docstore/17ac2f26-d891-45cf-a49e-8c42d1e00eaa new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/17ac2f26-d891-45cf-a49e-8c42d1e00eaa @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . 
END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/17ac80f4-b0df-48e2-a439-48c293fab3cf b/docstore/17ac80f4-b0df-48e2-a439-48c293fab3cf new file mode 100644 index 0000000000000000000000000000000000000000..4403c8e8ebca16251f4875b8e14907f4412efbd1 --- /dev/null +++ b/docstore/17ac80f4-b0df-48e2-a439-48c293fab3cf @@ -0,0 +1 @@ +"role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . 
Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import \ No newline at end of file diff --git a/docstore/17b37ed8-bf62-4da0-93c0-6abd72367d96 b/docstore/17b37ed8-bf62-4da0-93c0-6abd72367d96 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/17b37ed8-bf62-4da0-93c0-6abd72367d96 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/17c406c0-01ea-4250-b786-fe281726be17 b/docstore/17c406c0-01ea-4250-b786-fe281726be17 new file mode 100644 index 0000000000000000000000000000000000000000..ddb7a9245d74e27120ebf722d781e6ffdbe95888 --- /dev/null +++ b/docstore/17c406c0-01ea-4250-b786-fe281726be17 @@ -0,0 +1 @@ +"gemini-1.5-flash" ) imgData , err := os . 
ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about this instrument" ), genai . ImageData ( "jpeg" , imgData )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python Many of the same convenience features exist in the new SDK. For example, PIL.Image objects are automatically converted. from google import genai from PIL import Image client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Tell me a story based on this image' , Image . open ( image_path ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const organ = await ai . files . upload ({ file : "path/to/organ.jpg" , }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : [ createUserContent ([ "Tell me a story based on this image" , createPartFromUri ( organ . uri , organ . mimeType ) ]), ], }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { { Text : "Tell me a story based on this image" }, { InlineData : & genai . Blob { Data : imgData , MIMEType : "image/jpeg" }}, } contents := [] * genai . Content { { Parts : parts }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Streaming Before Python import google.generativeai as genai response = model . generate_content ( "Write a cute story about cats." , stream = True ) for chunk in response : print ( chunk . text ) \ No newline at end of file diff --git a/docstore/17c795fc-a0be-4f6a-9a2d-594c051a3bdb b/docstore/17c795fc-a0be-4f6a-9a2d-594c051a3bdb new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/17c795fc-a0be-4f6a-9a2d-594c051a3bdb @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? 
""" thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/17e95acd-8cdc-4b43-8fc3-2d80eb716751 b/docstore/17e95acd-8cdc-4b43-8fc3-2d80eb716751 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/17e95acd-8cdc-4b43-8fc3-2d80eb716751 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/17f59526-ef74-40e3-9d39-30390d0ea131 b/docstore/17f59526-ef74-40e3-9d39-30390d0ea131 new file mode 100644 index 0000000000000000000000000000000000000000..d73a03ac64bf52901f07bf0a8fe4fc21e47f6048 --- /dev/null +++ b/docstore/17f59526-ef74-40e3-9d39-30390d0ea131 @@ -0,0 +1 @@ +are used in a variety of common AI use cases, such as: Information retrieval: You can use embeddings to retrieve semantically similar text given a piece of input text. Document search tutorial task Clustering: Comparing groups of embeddings can help identify hidden trends. Embedding clustering tutorial bubble_chart Vector database: As you take different embedding use cases to production, it is common to store embeddings in a vector database. Vector database tutorial bolt Classification: You can train a model using embeddings to classify documents into categories. Classification tutorial token Embedding models The Gemini API offers three models that generate text embeddings: gemini-embedding-exp-03-07 text-embedding-004 embedding-001 What's next Check out the embeddings quickstart notebook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/18063a0b-5df9-4d29-969d-faf5060e40e5 b/docstore/18063a0b-5df9-4d29-969d-faf5060e40e5 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/18063a0b-5df9-4d29-969d-faf5060e40e5 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . 
Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/1842ac17-bd3d-4f3d-b0d5-926fb8deda51 b/docstore/1842ac17-bd3d-4f3d-b0d5-926fb8deda51 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/1842ac17-bd3d-4f3d-b0d5-926fb8deda51 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/1858f855-3048-46f0-821b-1cf26c7fb4b3 b/docstore/1858f855-3048-46f0-821b-1cf26c7fb4b3 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/1858f855-3048-46f0-821b-1cf26c7fb4b3 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/1882c530-c01b-4270-b344-7d40dda2da0d b/docstore/1882c530-c01b-4270-b344-7d40dda2da0d new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/1882c530-c01b-4270-b344-7d40dda2da0d @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/188c6cc2-e4de-4a81-8f2a-eaec55341e9a b/docstore/188c6cc2-e4de-4a81-8f2a-eaec55341e9a new file mode 100644 index 0000000000000000000000000000000000000000..8af7573f633bc8337efd3a0ab87cdc8a90abf578 --- /dev/null +++ b/docstore/188c6cc2-e4de-4a81-8f2a-eaec55341e9a @@ -0,0 +1 @@ +Studio Our fastest multimodal model with great performance for diverse, repetitive tasks and a 1 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075, prompts <= 128k tokens $0.15, prompts > 128k tokens Output price Free of charge $0.30, prompts <= 128k tokens $0.60, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01875, prompts <= 128k tokens $0.0375, prompts > 128k tokens Context caching (storage) Free of charge $1.00 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Flash-8B Try it in Google AI Studio Our smallest model for lower intelligence use cases, with a 1 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.0375, prompts <= 128k tokens $0.075, prompts > 128k tokens Output price Free of charge $0.15, prompts <= 128k tokens $0.30, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01, prompts <= 128k tokens $0.02, prompts > 128k tokens Context caching (storage) Free of charge $0.25 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Pro Try it in Google AI Studio Our highest intelligence Gemini 1.5 series model, with a breakthrough 2 million token context window. 
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 128k tokens $2.50, prompts > 128k tokens Output price Free of charge $5.00, prompts <= 128k tokens $10.00, prompts > 128k \ No newline at end of file diff --git a/docstore/188d4f83-bc3f-485b-bf1a-78e904a4023a b/docstore/188d4f83-bc3f-485b-bf1a-78e904a4023a new file mode 100644 index 0000000000000000000000000000000000000000..c6a0ed5e9252f2150e0b1d0b64993cc77cfd2978 --- /dev/null +++ b/docstore/188d4f83-bc3f-485b-bf1a-78e904a4023a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies#prefixes Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/189d5779-0d56-424a-8462-597da25ed393 b/docstore/189d5779-0d56-424a-8462-597da25ed393 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/189d5779-0d56-424a-8462-597da25ed393 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/18c98164-ae6c-45b0-a0c2-b083f234e453 b/docstore/18c98164-ae6c-45b0-a0c2-b083f234e453 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/18c98164-ae6c-45b0-a0c2-b083f234e453 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/18cc2128-99f7-45b5-bde1-d8de45a25e8a b/docstore/18cc2128-99f7-45b5-bde1-d8de45a25e8a new file mode 100644 index 0000000000000000000000000000000000000000..dc35916dc3478088071aa0ef0f9dbb66dbfa3387 --- /dev/null +++ b/docstore/18cc2128-99f7-45b5-bde1-d8de45a25e8a @@ -0,0 +1 @@ +response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . 
mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfURL := "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" localPdfPath := "A17_FlightPlan_downloaded.pdf" respHttp , _ := http . Get ( pdfURL ) defer respHttp . Body . Close () outFile , _ := os . Create ( localPdfPath ) defer outFile . Close () _ , _ = io . Copy ( outFile , respHttp . Body ) uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . \ No newline at end of file diff --git a/docstore/18e55130-39c9-4cc7-89e6-56001add751a b/docstore/18e55130-39c9-4cc7-89e6-56001add751a new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/18e55130-39c9-4cc7-89e6-56001add751a @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. 
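As a quick illustration of the updated-prompt technique above, the sketch below sends the more specific prompt together with inline image data, following the inline-image pattern shown earlier in these docs; the file path is a placeholder for your own image.

Python
# Sketch: pairing a more specific prompt with inline image data, following
# the inline-image pattern shown earlier on this page.
from google import genai
from google.genai import types

client = genai.Client()

with open("path/to/bunny.jpg", "rb") as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Can you write me a descriptive and dramatic poem about this image "
        "and include the location?",
    ],
)
print(response.text)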
Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/18e7535b-cb80-432d-95f1-3fa4bf80a719 b/docstore/18e7535b-cb80-432d-95f1-3fa4bf80a719 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/18e7535b-cb80-432d-95f1-3fa4bf80a719 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/190f3f12-6f47-40d9-8f95-3524f5fc9471 b/docstore/190f3f12-6f47-40d9-8f95-3524f5fc9471 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/190f3f12-6f47-40d9-8f95-3524f5fc9471 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/190fa881-6508-49a3-aa7f-dfb88045c98f b/docstore/190fa881-6508-49a3-aa7f-dfb88045c98f new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/190fa881-6508-49a3-aa7f-dfb88045c98f @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. 
Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/191e41a8-ed79-4b43-9b7f-3dc83af97330 b/docstore/191e41a8-ed79-4b43-9b7f-3dc83af97330 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/191e41a8-ed79-4b43-9b7f-3dc83af97330 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/193c6fdb-dc9f-4f88-a06c-22a69db20eee b/docstore/193c6fdb-dc9f-4f88-a06c-22a69db20eee new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/193c6fdb-dc9f-4f88-a06c-22a69db20eee @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/196ff2b1-9802-48af-9af7-4bac67f52995 b/docstore/196ff2b1-9802-48af-9af7-4bac67f52995 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/196ff2b1-9802-48af-9af7-4bac67f52995 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. 
Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/19828a48-4403-47b3-91f6-5cd3237adefd b/docstore/19828a48-4403-47b3-91f6-5cd3237adefd new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ b/docstore/19828a48-4403-47b3-91f6-5cd3237adefd @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/19d1017b-c375-4762-89d3-34a192d779d7 b/docstore/19d1017b-c375-4762-89d3-34a192d779d7 new file mode 100644 index 0000000000000000000000000000000000000000..10d595bd2c735f8912abb00e69220b9ae90d3d23 --- /dev/null +++ b/docstore/19d1017b-c375-4762-89d3-34a192d779d7 @@ -0,0 +1 @@ +Audio understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
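Picking up the inline Batch Mode example that is cut off above: a minimal end-to-end sketch, assuming the google-genai Python SDK (the prompts and display name are illustrative), might look like this.

from google import genai

client = genai.Client()

# Each entry is a complete GenerateContentRequest-style dictionary.
inline_requests = [
    {"contents": [{"parts": [{"text": "Tell me a one-sentence joke."}], "role": "user"}]},
    {"contents": [{"parts": [{"text": "Why is the sky blue?"}], "role": "user"}]},
]

# Create a batch job with the requests embedded inline (suitable while the
# total request size stays under 20 MB).
inline_batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=inline_requests,
    config={"display_name": "inlined-requests-job-1"},
)
print(inline_batch_job.name)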
Learn more Home Gemini API Models Send feedback Audio understanding Gemini can analyze and understand audio input, enabling use cases like the following: Describe, summarize, or answer questions about audio content. Provide a transcription of the audio. Analyze specific segments of the audio. This guide shows you how to use the Gemini API to generate a text response to audio input. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Input audio You can provide audio data to Gemini in the following ways: Upload an audio file before making a request to generateContent . Pass inline audio data with the request to generateContent . Upload an audio file You can use the Files API to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads an audio file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mp3" }, }); const response = await ai . models . generateContent ({ \ No newline at end of file diff --git a/docstore/19dbb796-5a33-453a-870a-8bb4f9851b01 b/docstore/19dbb796-5a33-453a-870a-8bb4f9851b01 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/19dbb796-5a33-453a-870a-8bb4f9851b01 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/1a213641-9eba-43f1-a679-478b3f3e29dc b/docstore/1a213641-9eba-43f1-a679-478b3f3e29dc new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/1a213641-9eba-43f1-a679-478b3f3e29dc @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . 
NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/1a2571ee-4eba-41dc-8ec1-1711116eadc0 b/docstore/1a2571ee-4eba-41dc-8ec1-1711116eadc0 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/1a2571ee-4eba-41dc-8ec1-1711116eadc0 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/1a2f96e6-c587-472d-8f68-63dc4aa376c0 b/docstore/1a2f96e6-c587-472d-8f68-63dc4aa376c0 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/1a2f96e6-c587-472d-8f68-63dc4aa376c0 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. 
(mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/1a33c4d2-fc51-4521-a87e-c2e7196fbcf6 b/docstore/1a33c4d2-fc51-4521-a87e-c2e7196fbcf6 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/1a33c4d2-fc51-4521-a87e-c2e7196fbcf6 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/1a3f3a32-38ed-49b9-85b2-f9bed821ea35 b/docstore/1a3f3a32-38ed-49b9-85b2-f9bed821ea35 new file mode 100644 index 0000000000000000000000000000000000000000..fdea6397d0ee0c5ce13453eceb7f458532b87688 --- /dev/null +++ b/docstore/1a3f3a32-38ed-49b9-85b2-f9bed821ea35 @@ -0,0 +1 @@ +"BLOCK_MEDIUM_AND_ABOVE"} ], "contents": [{ "parts":[{ "text": "' I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write a ironic phrase about them. '"}]}]}' > request.json curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d @request.json 2 > /dev/null Next steps See the API reference to learn more about the full API. Review the safety guidance for a general look at safety considerations when developing with LLMs. Learn more about assessing probability versus severity from the Jigsaw team Learn more about the products that contribute to safety solutions like the Perspective API . * You can use these safety settings to create a toxicity classifier. See the classification example to get started. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/1a53f5b3-b9af-4afa-8bab-39379b0b2b10 b/docstore/1a53f5b3-b9af-4afa-8bab-39379b0b2b10 new file mode 100644 index 0000000000000000000000000000000000000000..019b7de7e49d445c43758810d78952e4f88cd47b --- /dev/null +++ b/docstore/1a53f5b3-b9af-4afa-8bab-39379b0b2b10 @@ -0,0 +1 @@ +prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . 
stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . \ No newline at end of file diff --git a/docstore/1a8897fb-d99a-45e7-82de-48e70fc83bf0 b/docstore/1a8897fb-d99a-45e7-82de-48e70fc83bf0 new file mode 100644 index 0000000000000000000000000000000000000000..de98b41ec31106077167d65dc0d83dfd4822d872 --- /dev/null +++ b/docstore/1a8897fb-d99a-45e7-82de-48e70fc83bf0 @@ -0,0 +1 @@ +moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the \ No newline at end of file diff --git a/docstore/1ab51611-d981-4a4d-94fd-298f07b39f6c b/docstore/1ab51611-d981-4a4d-94fd-298f07b39f6c new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/1ab51611-d981-4a4d-94fd-298f07b39f6c @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . 
Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/1abbd8e9-ebbd-476c-834a-cf8521830528 b/docstore/1abbd8e9-ebbd-476c-834a-cf8521830528 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/1abbd8e9-ebbd-476c-834a-cf8521830528 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/1abfd323-8711-40f2-b533-05460ce15086 b/docstore/1abfd323-8711-40f2-b533-05460ce15086 new file mode 100644 index 0000000000000000000000000000000000000000..ed4fdba10a6b64b8e5b2459769ecb435f9ec471b --- /dev/null +++ b/docstore/1abfd323-8711-40f2-b533-05460ce15086 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation#gemini Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1acde25e-0d29-486f-8c37-14517dc69265 b/docstore/1acde25e-0d29-486f-8c37-14517dc69265 new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/1acde25e-0d29-486f-8c37-14517dc69265 @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . 
debug \ No newline at end of file diff --git a/docstore/1ad26033-22d9-4afe-b298-f30a3eb52a34 b/docstore/1ad26033-22d9-4afe-b298-f30a3eb52a34 new file mode 100644 index 0000000000000000000000000000000000000000..1928fbda4690570381db2fc0734d5c40f27390c8 --- /dev/null +++ b/docstore/1ad26033-22d9-4afe-b298-f30a3eb52a34 @@ -0,0 +1 @@ +Part { InlineData : & genai . Blob { MIMEType : "audio/mp3" , Data : audioBytes , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } A few things to keep in mind about inline audio data: The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the total request size exceed 20 MB, then use the Files API to upload an audio file for use in the request. If you're using an audio sample multiple times, it's more efficient to upload an audio file . Get a transcript To get a transcript of audio data, just ask for it in the prompt: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) prompt = 'Generate a transcript of the speech.' response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ prompt , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Generate a transcript of the speech." , ]), }); console . log ( "result.text=" , result . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Generate a transcript of the speech." ), \ No newline at end of file diff --git a/docstore/1adf5b26-2c38-45ea-8085-d72915d75cff b/docstore/1adf5b26-2c38-45ea-8085-d72915d75cff new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/1adf5b26-2c38-45ea-8085-d72915d75cff @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . 
close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/1b3f7b51-ac0a-470d-ae77-0946b34b20a4 b/docstore/1b3f7b51-ac0a-470d-ae77-0946b34b20a4 new file mode 100644 index 0000000000000000000000000000000000000000..78c2f9bf8b354b9f4d6083220bd2208380f47813 --- /dev/null +++ b/docstore/1b3f7b51-ac0a-470d-ae77-0946b34b20a4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/rate-limits Title: Rate limits | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1b57a65e-43bc-4314-aad8-23d3d56d80ca b/docstore/1b57a65e-43bc-4314-aad8-23d3d56d80ca new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/1b57a65e-43bc-4314-aad8-23d3d56d80ca @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
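Relating the audio-format notes above to code: a minimal sketch of sending 16 kHz, 16-bit PCM input over a Live API session, assuming the google-genai Python SDK (the model name and local file path are illustrative).

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Raw little-endian 16-bit PCM sampled at 16 kHz.
        with open("sample_16k.pcm", "rb") as f:
            audio_bytes = f.read()
        # The MIME type conveys the input sample rate to the Live API.
        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )
        async for response in session.receive():
            if response.text:
                print(response.text, end="")

if __name__ == "__main__":
    asyncio.run(main())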
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/1b5ad7ec-7811-440f-849f-f517cf9675a0 b/docstore/1b5ad7ec-7811-440f-849f-f517cf9675a0 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/1b5ad7ec-7811-440f-849f-f517cf9675a0 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/1b5c5064-3822-4dba-933f-710a0e91a1c7 b/docstore/1b5c5064-3822-4dba-933f-710a0e91a1c7 new file mode 100644 index 0000000000000000000000000000000000000000..3c7819093774ca7626711ba692d14925f51fa93f --- /dev/null +++ b/docstore/1b5c5064-3822-4dba-933f-710a0e91a1c7 @@ -0,0 +1 @@ +text embeddings: Python from google import genai client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , }); console . log ( response . embeddings ); } main (); Go package main import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := [] * genai . Content { genai . NewContentFromText ( "What is the meaning of life?" , genai . RoleUser ), } result , err := client . Models . EmbedContent ( ctx , "gemini-embedding-exp-03-07" , contents , nil , ) if err != nil { log . Fatal ( err ) } embeddings , err := json . MarshalIndent ( result . Embeddings , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . 
Println ( string ( embeddings )) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]} }' You can also generate embeddings for multiple chunks at once by passing them in as a list of strings. Task types When building Retrieval Augmented Generation (RAG) systems, a common design is to use text embeddings to perform a similarity search. In some cases this can lead to degraded quality, because questions and their answers are not semantically similar. For example, a question like "Why is the sky blue?" and its answer "The scattering of sunlight causes the blue color," have distinctly different \ No newline at end of file diff --git a/docstore/1b780760-28e2-4670-920a-17e63b210dfb b/docstore/1b780760-28e2-4670-920a-17e63b210dfb new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/1b780760-28e2-4670-920a-17e63b210dfb @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/1b913e7f-a3c5-428f-80d4-879329199b87 b/docstore/1b913e7f-a3c5-428f-80d4-879329199b87 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/1b913e7f-a3c5-428f-80d4-879329199b87 @@ -0,0 +1 @@ +help with that, as I'm only a language model." 
If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/1b914a3b-26f6-4879-b2c7-17b3e08dd4e8 b/docstore/1b914a3b-26f6-4879-b2c7-17b3e08dd4e8 new file mode 100644 index 0000000000000000000000000000000000000000..01c8292e0823a69abce36f98d8faed7f3484996b --- /dev/null +++ b/docstore/1b914a3b-26f6-4879-b2c7-17b3e08dd4e8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1b95ff85-28a7-4cd7-ad28-abfe61b12a69 b/docstore/1b95ff85-28a7-4cd7-ad28-abfe61b12a69 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/1b95ff85-28a7-4cd7-ad28-abfe61b12a69 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. 
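To make the temperature discussion above concrete, here is a minimal sketch assuming the google-genai Python SDK: a temperature of 0 keeps decoding close to greedy selection of the most likely tokens, while a higher value samples more broadly over the distribution.

from google import genai
from google.genai import types

client = genai.Client()
prompt = "The dog jumped over the"

# Near-deterministic decoding: prefer the most likely tokens.
low_temp = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=0.0),
)

# Higher temperature: more randomness in the sampled tokens.
high_temp = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=1.5),
)

print(low_temp.text)
print(high_temp.text)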
Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: <dependency> <groupId>com.google.genai</groupId> <artifactId>google-genai</artifactId> <version>1.0.0</version> </dependency> General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/1b9ee03d-25d5-4113-9b02-bcf0f731b8ac b/docstore/1b9ee03d-25d5-4113-9b02-bcf0f731b8ac new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/1b9ee03d-25d5-4113-9b02-bcf0f731b8ac @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/1bbed516-e1fe-4f3e-a721-6c14b955178d b/docstore/1bbed516-e1fe-4f3e-a721-6c14b955178d new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/1bbed516-e1fe-4f3e-a721-6c14b955178d @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/1bc9074e-cedf-4664-ba24-da7c220316cf b/docstore/1bc9074e-cedf-4664-ba24-da7c220316cf new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/1bc9074e-cedf-4664-ba24-da7c220316cf @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/1bcf1461-d234-4aac-a160-633ce9529ea1 b/docstore/1bcf1461-d234-4aac-a160-633ce9529ea1 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/1bcf1461-d234-4aac-a160-633ce9529ea1 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/1bec611c-1444-4748-bf86-4398b228c7c7 b/docstore/1bec611c-1444-4748-bf86-4398b228c7c7 new file mode 100644 index 0000000000000000000000000000000000000000..02741f017de0a4f2326222e7e2c63ce436f783ef --- /dev/null +++ b/docstore/1bec611c-1444-4748-bf86-4398b228c7c7 @@ -0,0 +1 @@ +unsafe prompt." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( response . Text ()) } JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const safetySettings = [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_LOW_AND_ABOVE" , }, { category : "HARM_CATEGORY_HATE_SPEECH" , threshold : "BLOCK_LOW_AND_ABOVE" , }, ]; async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Some potentially unsafe prompt." , config : { safetySettings : safetySettings , }, }); console . log ( response . text ); } await main (); Dart (Flutter) final safetySettings = [ SafetySetting ( HarmCategory . harassment , HarmBlockThreshold . low ), SafetySetting ( HarmCategory . hateSpeech , HarmBlockThreshold . 
low ), ]; final model = GenerativeModel ( model: 'gemini-1.5-flash' , apiKey: apiKey , safetySettings: safetySettings , ); Kotlin val harassmentSafety = SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ) val hateSpeechSafety = SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ) val generativeModel = GenerativeModel ( modelName = "gemini-1.5-flash" , apiKey = BuildConfig . apiKey , safetySettings = listOf ( harassmentSafety , hateSpeechSafety ) ) Java SafetySetting harassmentSafety = new SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ); SafetySetting hateSpeechSafety = new SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ); GenerativeModel gm = new GenerativeModel ( "gemini-1.5-flash" , BuildConfig . apiKey , null , // generation config is optional Arrays . asList ( harassmentSafety , hateSpeechSafety ) ); GenerativeModelFutures model = GenerativeModelFutures . from ( gm ); REST echo '{ "safetySettings": [ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": \ No newline at end of file diff --git a/docstore/1bffe781-84f6-400e-ba61-4199f6add0d9 b/docstore/1bffe781-84f6-400e-ba61-4199f6add0d9 new file mode 100644 index 0000000000000000000000000000000000000000..20761df3fca752fa9d82e2d418d8f0a38b8dac4d --- /dev/null +++ b/docstore/1bffe781-84f6-400e-ba61-4199f6add0d9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#main-content Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1c1b4623-140d-4b4f-9d6f-134f457f70fb b/docstore/1c1b4623-140d-4b4f-9d6f-134f457f70fb new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/1c1b4623-140d-4b4f-9d6f-134f457f70fb @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/1c4b56b1-2cb0-4be1-8d71-3cb577c7c1e0 b/docstore/1c4b56b1-2cb0-4be1-8d71-3cb577c7c1e0 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/1c4b56b1-2cb0-4be1-8d71-3cb577c7c1e0 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . 
append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/1c51dd72-f4c0-4a61-9855-bf2a151b204d b/docstore/1c51dd72-f4c0-4a61-9855-bf2a151b204d new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/1c51dd72-f4c0-4a61-9855-bf2a151b204d @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/1c5c459f-301c-4a78-8ab9-f44a2eb412a6 b/docstore/1c5c459f-301c-4a78-8ab9-f44a2eb412a6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/1c5c459f-301c-4a78-8ab9-f44a2eb412a6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
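The compositional function calling excerpt above is truncated mid-request, so here is a self-contained sketch of the same automatic function calling pattern with stub tools; the prompt wording and the stub return values are illustrative assumptions rather than the page's exact example.

Python
from google import genai
from google.genai import types

def get_weather_forecast(location: str) -> dict:
    """Stub tool: returns a fixed temperature for the demo."""
    return {"temperature": 25, "unit": "celsius"}

def set_thermostat_temperature(temperature: int) -> dict:
    """Stub tool: pretends to set a thermostat."""
    return {"status": "success"}

client = genai.Client()

# Passing plain Python functions as tools turns on automatic function calling:
# the SDK builds the declarations, runs whichever calls the model requests
# (possibly chained), and returns the model's final text answer.
config = types.GenerateContentConfig(
    tools=[get_weather_forecast, set_thermostat_temperature]
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="If it's warmer than 20°C in London, set the thermostat to 20°C; otherwise leave it alone.",
    config=config,
)
print(response.text)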
\ No newline at end of file diff --git a/docstore/1c6a6494-c617-43b3-9602-567249576319 b/docstore/1c6a6494-c617-43b3-9602-567249576319 new file mode 100644 index 0000000000000000000000000000000000000000..257bd26d73ffcf83a4c86e6ef658baba1dfda511 --- /dev/null +++ b/docstore/1c6a6494-c617-43b3-9602-567249576319 @@ -0,0 +1 @@ +brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( \ No newline at end of file diff --git a/docstore/1c786e99-64bf-4ab3-b537-e34905f5c8b1 b/docstore/1c786e99-64bf-4ab3-b537-e34905f5c8b1 new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/1c786e99-64bf-4ab3-b537-e34905f5c8b1 @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . 
text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/1c78d5df-f86a-4677-b0e4-02b7ab153245 b/docstore/1c78d5df-f86a-4677-b0e4-02b7ab153245 new file mode 100644 index 0000000000000000000000000000000000000000..c6aa65d262d73786cc7d106425f2574a0a896d12 --- /dev/null +++ b/docstore/1c78d5df-f86a-4677-b0e4-02b7ab153245 @@ -0,0 +1 @@ +the modal, you can use the sliders to adjust the content filtering level per safety category: Note: If you set any of the category filters to Block none , Google AI Studio will display a reminder about the Gemini API's Terms of Service with respect to safety settings. When you send a request (for example, by asking the model a question), a warning No Content message appears if the request's content is blocked. To see more details, hold the pointer over the No Content text and click warning Safety . Gemini API SDKs The following code snippet shows how to set safety settings in your GenerateContent call. This sets the thresholds for the harassment ( HARM_CATEGORY_HARASSMENT ) and hate speech ( HARM_CATEGORY_HATE_SPEECH ) categories. For example, setting these categories to BLOCK_LOW_AND_ABOVE blocks any content that has a low or higher probability of being harassment or hate speech. To understand the threshold settings, see Safety filtering per request . Python from google import genai from google.genai import types import PIL.Image img = PIL . Image . open ( "cookies.jpg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ 'Do these look store-bought or homemade?' , img ], config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = types . HarmCategory . HARM_CATEGORY_HATE_SPEECH , threshold = types . HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , ), ] ) ) print ( response . text ) Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SafetySettings : [] * genai . SafetySetting { { Category : "HARM_CATEGORY_HATE_SPEECH" , Threshold : "BLOCK_LOW_AND_ABOVE" , }, }, } response , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . 
Text ( "Some potentially \ No newline at end of file diff --git a/docstore/1c96101f-14d6-4398-8530-d94e71f1008d b/docstore/1c96101f-14d6-4398-8530-d94e71f1008d new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/1c96101f-14d6-4398-8530-d94e71f1008d @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/1cbc958e-a91a-41ce-916a-8473eba7d510 b/docstore/1cbc958e-a91a-41ce-916a-8473eba7d510 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/1cbc958e-a91a-41ce-916a-8473eba7d510 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/1cd22c8d-e9da-463a-aee7-c3bf04f183f9 b/docstore/1cd22c8d-e9da-463a-aee7-c3bf04f183f9 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/1cd22c8d-e9da-463a-aee7-c3bf04f183f9 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/1d09a352-2621-47f1-91df-59af55225adc b/docstore/1d09a352-2621-47f1-91df-59af55225adc new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/1d09a352-2621-47f1-91df-59af55225adc @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/1d0f1fbf-0056-441c-88cb-114507f8bb96 b/docstore/1d0f1fbf-0056-441c-88cb-114507f8bb96 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/1d0f1fbf-0056-441c-88cb-114507f8bb96 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/1d1e1b2f-4410-46ec-b4ee-1c27381902b7 b/docstore/1d1e1b2f-4410-46ec-b4ee-1c27381902b7 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/1d1e1b2f-4410-46ec-b4ee-1c27381902b7 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/1d2cec0d-6999-40bb-826d-b9627019cee5 b/docstore/1d2cec0d-6999-40bb-826d-b9627019cee5 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/1d2cec0d-6999-40bb-826d-b9627019cee5 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . 
text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/1d2fab17-6d24-4294-98dc-68e7ea19477f b/docstore/1d2fab17-6d24-4294-98dc-68e7ea19477f new file mode 100644 index 0000000000000000000000000000000000000000..f8b767c4f0817ec13b72ba69e0f749f993720026 --- /dev/null +++ b/docstore/1d2fab17-6d24-4294-98dc-68e7ea19477f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/google-search Title: Grounding with Google Search | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1d370b29-7942-42d8-ba16-313f4fd65537 b/docstore/1d370b29-7942-42d8-ba16-313f4fd65537 new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/1d370b29-7942-42d8-ba16-313f4fd65537 @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. This lets to capture more of the scene vertically compared to the 1:1 aspect ratio. Prompt: a woman hiking, close of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. This a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/1d44d6db-82c0-4fa0-b3b6-c8ca0cdba8ba b/docstore/1d44d6db-82c0-4fa0-b3b6-c8ca0cdba8ba new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/1d44d6db-82c0-4fa0-b3b6-c8ca0cdba8ba @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
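The text-embedding-004 summary above has no accompanying snippet; a minimal sketch of requesting embeddings with the Python SDK follows. The embed_content call, the .embeddings/.values fields, and the cosine comparison are assumptions based on the SDK's embedding interface, not text from this page.

Python
from google import genai

client = genai.Client()

# Embed two strings and compare their relatedness with cosine similarity.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How is snow formed?", "What makes snowflakes?"],
)

def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = lambda v: sum(x * x for x in v) ** 0.5
    return dot / (norm(a) * norm(b))

vec_a, vec_b = (e.values for e in result.embeddings)
print(f"{len(vec_a)}-dim embeddings, cosine similarity = {cosine(vec_a, vec_b):.3f}")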
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/1d540ff9-13a6-4333-ac08-988243dad901 b/docstore/1d540ff9-13a6-4333-ac08-988243dad901 new file mode 100644 index 0000000000000000000000000000000000000000..aacc2cdd644f4375a683fabb35c29fb0962b3a72 --- /dev/null +++ b/docstore/1d540ff9-13a6-4333-ac08-988243dad901 @@ -0,0 +1 @@ +context include: Video question and answering Video memory, as shown with Google's Project Astra Video captioning Video recommendation systems, by enriching existing metadata with new multimodal understanding Video customization, by looking at a corpus of data and associated video metadata and then removing parts of videos that are not relevant to the viewer Video content moderation Real-time video processing When working with videos, it is important to consider how the videos are processed into tokens , which affects billing and usage limits. You can learn more about prompting with video files in the Prompting guide . Long form audio The Gemini models were the first natively multimodal large language models that could understand audio. Historically, the typical developer workflow would involve stringing together multiple domain specific models, like a speech-to-text model and a text-to-text model, in order to process audio. This led to additional latency required by performing multiple round-trip requests and decreased performance usually attributed to disconnected architectures of the multiple model setup. Some emerging and standard use cases for audio context include: Real-time transcription and translation Podcast / video question and answering Meeting transcription and summarization Voice assistants You can learn more about prompting with audio files in the Prompting guide . Long context optimizations The primary optimization when working with long context and the Gemini models is to use context caching . 
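Since the sentence above names context caching as the primary long-context optimization, here is a rough sketch of what that looks like with the Python SDK: upload a large file once, cache it, then ask repeated questions against the cache. The file path, system instruction, and CreateCachedContentConfig field names are assumptions patterned on the caching snippets that appear later in this diff, and the chosen model must actually support caching.

Python
from google import genai
from google.genai import types

client = genai.Client()

# Upload a large document once...
doc = client.files.upload(file="path/to/long_transcript.txt")

# ...then cache it so repeated questions don't re-send (and re-bill) the same tokens.
cache = client.caches.create(
    model="gemini-2.5-flash",  # assumption: a caching-capable model version
    config=types.CreateCachedContentConfig(
        contents=[doc],
        system_instruction="You are an expert at analyzing transcripts.",
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the key decisions in this transcript.",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)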
Beyond the previous impossibility of processing lots of tokens in a single request, the other main constraint was the cost. If you have a "chat with your data" app where a user uploads 10 PDFs, a video, and some work documents, you would historically have to work with a more complex retrieval augmented generation (RAG) tool / framework in order to process these requests and pay a significant amount for tokens moved into the context window. Now, you can cache \ No newline at end of file diff --git a/docstore/1d709b12-3f7c-4ce9-b3da-791c890f60e0 b/docstore/1d709b12-3f7c-4ce9-b3da-791c890f60e0 new file mode 100644 index 0000000000000000000000000000000000000000..05f586b9ee4ba7b248a7cf2844965480ce1e46ee --- /dev/null +++ b/docstore/1d709b12-3f7c-4ce9-b3da-791c890f60e0 @@ -0,0 +1 @@ +Code execution | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Code execution The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks. Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it. Enable code execution To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new \ No newline at end of file diff --git a/docstore/1d87af15-dcc8-48a3-8b3e-21eedbd83743 b/docstore/1d87af15-dcc8-48a3-8b3e-21eedbd83743 new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/1d87af15-dcc8-48a3-8b3e-21eedbd83743 @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. Avoid exceeding three phrases for cleaner compositions. 
Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/1d992f1b-927b-483a-8b84-c194cdbcff57 b/docstore/1d992f1b-927b-483a-8b84-c194cdbcff57 new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/1d992f1b-927b-483a-8b84-c194cdbcff57 @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . 
show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . \ No newline at end of file diff --git a/docstore/1db66660-c575-4eb6-80a3-10ea6b693574 b/docstore/1db66660-c575-4eb6-80a3-10ea6b693574 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/1db66660-c575-4eb6-80a3-10ea6b693574 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/1ddd39ef-d4b0-422f-9a12-26a1dc06b824 b/docstore/1ddd39ef-d4b0-422f-9a12-26a1dc06b824 new file mode 100644 index 0000000000000000000000000000000000000000..1426f6277d87da029e324e49b5a4fcb88dde544c --- /dev/null +++ b/docstore/1ddd39ef-d4b0-422f-9a12-26a1dc06b824 @@ -0,0 +1 @@ +live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. 
Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/1dde0da1-7a8f-47d2-b5b5-2f64d5fa1a2d b/docstore/1dde0da1-7a8f-47d2-b5b5-2f64d5fa1a2d new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/1dde0da1-7a8f-47d2-b5b5-2f64d5fa1a2d @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/1de2ddfd-88f8-4c7d-b249-f96779d329ea b/docstore/1de2ddfd-88f8-4c7d-b249-f96779d329ea new file mode 100644 index 0000000000000000000000000000000000000000..91fd6dbcb4d807434a341e23a941ef0850298bc5 --- /dev/null +++ b/docstore/1de2ddfd-88f8-4c7d-b249-f96779d329ea @@ -0,0 +1 @@ +transcript' , config = types . GenerateContentConfig ( cached_content = apollo_cache . name , ) ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const filePath = path . join ( media , "a11.txt" ); const document = await ai . files . upload ({ file : filePath , config : { mimeType : "text/plain" }, }); console . log ( "Uploaded file name:" , document . name ); const modelName = "gemini-1.5-flash" ; const contents = [ createUserContent ( createPartFromUri ( document . uri , document . mimeType )), ]; const cache = await ai . caches . create ({ model : modelName , config : { contents : contents , systemInstruction : "You are an expert analyzing transcripts." , }, }); console . log ( "Cache created:" , cache ); const response = await ai . models . generateContent ({ model : modelName , contents : "Please summarize this transcript" , config : { cachedContent : cache . name }, }); console . log ( "Response text:" , response . text ); Count tokens Count the number of tokens in a request. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . count_tokens ( 'The quick brown fox jumps over the lazy dog.' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , }); // Count tokens in a prompt without calling text generation. const countResult = await model . countTokens ( "The quick brown fox jumps over the lazy dog." , ); console . log ( countResult . totalTokens ); // 11 const generateResult = await model . generateContent ( "The quick brown fox jumps over the lazy dog." , ); // On the response for `generateContent`, use `usageMetadata` // to get separate input and output token counts // (`promptTokenCount` and `candidatesTokenCount`, respectively), // as well as the combined token count \ No newline at end of file diff --git a/docstore/1de5b5bb-f226-4dcf-b8e9-8d1473f32a45 b/docstore/1de5b5bb-f226-4dcf-b8e9-8d1473f32a45 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/1de5b5bb-f226-4dcf-b8e9-8d1473f32a45 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai .
RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/1dfef05f-5d64-4cac-b8a0-710504d4b9e0 b/docstore/1dfef05f-5d64-4cac-b8a0-710504d4b9e0 new file mode 100644 index 0000000000000000000000000000000000000000..18e5380dd4144398b3d4c6273920669cbc2b0130 --- /dev/null +++ b/docstore/1dfef05f-5d64-4cac-b8a0-710504d4b9e0 @@ -0,0 +1 @@ +'gemini-2.0-flash' , contents = 'Tell me a story in 100 words.' , config = types . GenerateContentConfig ( system_instruction = 'you are a story teller for kids under 5 years old' , max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], seed = 42 , ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story about a magic backpack." , config : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York" ), & genai . GenerateContentConfig { Temperature : genai . Ptr [ float32 ]( 0.5 ), TopP : genai . Ptr [ float32 ]( 0.5 ), TopK : genai . Ptr [ float32 ]( 2.0 ), ResponseMIMEType : "application/json" , StopSequences : [] string { "Yankees" }, CandidateCount : 2 , Seed : genai . Ptr [ int32 ]( 42 ), MaxOutputTokens : 128 , PresencePenalty : genai . Ptr [ float32 ]( 0.5 ), FrequencyPenalty : genai . Ptr [ float32 ]( 0.5 ), }, ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing response Safety settings Generate a response with safety settings: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . 
generate_content ( 'say something bad' , safety_settings = { 'HATE' : 'BLOCK_ONLY_HIGH' , 'HARASSMENT' : 'BLOCK_ONLY_HIGH' , } ) JavaScript import { GoogleGenerativeAI , HarmCategory , HarmBlockThreshold } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI \ No newline at end of file diff --git a/docstore/1e18fae1-343e-4d75-819c-42e45d905352 b/docstore/1e18fae1-343e-4d75-819c-42e45d905352 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/1e18fae1-343e-4d75-819c-42e45d905352 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/1e1b8bdb-9ce9-4956-9b95-530d751eba39 b/docstore/1e1b8bdb-9ce9-4956-9b95-530d751eba39 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/1e1b8bdb-9ce9-4956-9b95-530d751eba39 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. 
The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/1e24681f-7394-4700-9b48-e16e7a6c2987 b/docstore/1e24681f-7394-4700-9b48-e16e7a6c2987 new file mode 100644 index 0000000000000000000000000000000000000000..fcd49fd9d1e1bfa6316de012e7df2b50b5ffda8c --- /dev/null +++ b/docstore/1e24681f-7394-4700-9b48-e16e7a6c2987 @@ -0,0 +1 @@ +from_cached_content ( cached_content = apollo_cache ) response = apollo_model . generate_content ( "Find a lighthearted moment from this transcript" ) JavaScript import { GoogleAICacheManager , GoogleAIFileManager } from "@google/generative-ai/server" ; import { GoogleGenerativeAI } from "@google/generative-ai" ; const cacheManager = new GoogleAICacheManager ( "GOOGLE_API_KEY" ); const fileManager = new GoogleAIFileManager ( "GOOGLE_API_KEY" ); const uploadResult = await fileManager . uploadFile ( "path/to/a11.txt" , { mimeType : "text/plain" , }); const cacheResult = await cacheManager . create ({ model : "models/gemini-1.5-flash" , contents : [ { role : "user" , parts : [ { fileData : { fileUri : uploadResult . file . uri , mimeType : uploadResult . file . mimeType , }, }, ], }, ], }); console . log ( cacheResult ); const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModelFromCachedContent ( cacheResult ); const result = await model . generateContent ( "Please summarize this transcript." , ); console . log ( result . response . text ()); After Python import requests import pathlib from google import genai from google.genai import types client = genai . Client () # Check which models support caching. for m in client . models . list (): for action in m . supported_actions : if action == "createCachedContent" : print ( m . name ) break # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = client . files . upload ( file = 'a11.txt' ) # Create cache model = 'gemini-1.5-flash-001' apollo_cache = client . caches . 
create ( model = model , config = { 'contents' : [ document ], 'system_instruction' : 'You are an expert at analyzing transcripts.' , }, ) # Generate response response = client . models . generate_content ( model = model , contents = 'Find a lighthearted moment from this \ No newline at end of file diff --git a/docstore/1e3af6f4-6293-4ade-b2c8-0883f0762788 b/docstore/1e3af6f4-6293-4ade-b2c8-0883f0762788 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/1e3af6f4-6293-4ade-b2c8-0883f0762788 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/1e50c33a-d2b5-4a55-94d5-8419010c25c7 b/docstore/1e50c33a-d2b5-4a55-94d5-8419010c25c7 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/1e50c33a-d2b5-4a55-94d5-8419010c25c7 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? 
""" tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/1e6e076b-7c20-4cbe-89d2-d5cab353d072 b/docstore/1e6e076b-7c20-4cbe-89d2-d5cab353d072 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/1e6e076b-7c20-4cbe-89d2-d5cab353d072 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/1e7135e0-ecf5-4238-9b1e-4a1f11cded31 b/docstore/1e7135e0-ecf5-4238-9b1e-4a1f11cded31 new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/1e7135e0-ecf5-4238-9b1e-4a1f11cded31 @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/1e779191-e192-4fd5-9bf7-f62e8e87ee85 b/docstore/1e779191-e192-4fd5-9bf7-f62e8e87ee85 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/1e779191-e192-4fd5-9bf7-f62e8e87ee85 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . 
log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/1e7b964d-9a09-4b2d-91d6-a27fccd39c46 b/docstore/1e7b964d-9a09-4b2d-91d6-a27fccd39c46 new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/1e7b964d-9a09-4b2d-91d6-a27fccd39c46 @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. 
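Few-shot prompts like the ones discussed above are sent to the API the same way as any other prompt. The following is a minimal sketch, assuming the google-genai Python SDK; the example pairs and the final question are illustrative placeholders, and gemini-2.5-flash is simply the model named in the response shown above.

from google import genai

client = genai.Client()  # expects GEMINI_API_KEY in the environment

# Two in-context examples establish the short-answer pattern,
# then the real question is appended in the same format.
few_shot_prompt = """Question: Why is the sky blue?
Answer: Due to Rayleigh scattering.

Question: What causes earthquakes?
Answer: Sudden release of energy in the Earth's crust.

Question: How is snow formed?
Answer:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)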
Patterns vs anti-patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti-pattern \ No newline at end of file diff --git a/docstore/1e8f37b2-ea5c-437c-96a5-41222744cc13 b/docstore/1e8f37b2-ea5c-437c-96a5-41222744cc13 new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/1e8f37b2-ea5c-437c-96a5-41222744cc13 @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly, you can specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/1ea72c65-8451-4055-95da-049b0aa90e6b b/docstore/1ea72c65-8451-4055-95da-049b0aa90e6b new file mode 100644 index 0000000000000000000000000000000000000000..2730923b4e3c83ce2cbbc44055e777f4a3d323b5 --- /dev/null +++ b/docstore/1ea72c65-8451-4055-95da-049b0aa90e6b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1eaf4491-a01a-45a7-8d97-7f116fc7365b b/docstore/1eaf4491-a01a-45a7-8d97-7f116fc7365b new file mode 100644 index 0000000000000000000000000000000000000000..b3339b694e68c4d7176324567b6e6d7542786980 --- /dev/null +++ b/docstore/1eaf4491-a01a-45a7-8d97-7f116fc7365b @@ -0,0 +1 @@ +YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request.
You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video \ No newline at end of file diff --git a/docstore/1f01f7a1-45b2-4e06-b5af-c8e65ad21431 b/docstore/1f01f7a1-45b2-4e06-b5af-c8e65ad21431 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/1f01f7a1-45b2-4e06-b5af-c8e65ad21431 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/1f059d4e-1784-4671-b344-31ed851a84d5 b/docstore/1f059d4e-1784-4671-b344-31ed851a84d5 new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/1f059d4e-1784-4671-b344-31ed851a84d5 @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/1f172b8a-4b33-41b9-b8d4-01984605c73b b/docstore/1f172b8a-4b33-41b9-b8d4-01984605c73b new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/1f172b8a-4b33-41b9-b8d4-01984605c73b @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/1f410863-5c3a-4fb8-99ae-b6b19ba62e24 b/docstore/1f410863-5c3a-4fb8-99ae-b6b19ba62e24 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/1f410863-5c3a-4fb8-99ae-b6b19ba62e24 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits.
We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/1f51f422-808b-491e-a947-07b2447e6f3a b/docstore/1f51f422-808b-491e-a947-07b2447e6f3a new file mode 100644 index 0000000000000000000000000000000000000000..487c70e9445bb8c3f55f54d929201b1584ed04dd --- /dev/null +++ b/docstore/1f51f422-808b-491e-a947-07b2447e6f3a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#veo-2 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1f560103-f395-45ff-9b94-0a9570aed315 b/docstore/1f560103-f395-45ff-9b94-0a9570aed315 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/1f560103-f395-45ff-9b94-0a9570aed315 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result .
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/1f5761dd-de78-4421-bbc9-d7f265d31959 b/docstore/1f5761dd-de78-4421-bbc9-d7f265d31959 new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/1f5761dd-de78-4421-bbc9-d7f265d31959 @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. 
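As a minimal sketch of setting a budget before the per-model details that follow, assuming the google-genai Python SDK; the prompt and the 1,024-token budget are illustrative choices, not values prescribed by the docs.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the concept of Occam's Razor and provide a simple, everyday example.",
    config=types.GenerateContentConfig(
        # 0 disables thinking (2.5 Flash / Flash-Lite only); -1 requests dynamic thinking.
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)
print(response.text)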
Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/1f8409f0-2c50-443e-b762-ec252cc891f6 b/docstore/1f8409f0-2c50-443e-b762-ec252cc891f6 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/1f8409f0-2c50-443e-b762-ec252cc891f6 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/1f98292f-3965-4761-8f3b-ed77bc24f05b b/docstore/1f98292f-3965-4761-8f3b-ed77bc24f05b new file mode 100644 index 0000000000000000000000000000000000000000..ca9fcd920a86bdee4d0b622e9ecd16eba0587472 --- /dev/null +++ b/docstore/1f98292f-3965-4761-8f3b-ed77bc24f05b @@ -0,0 +1 @@ +[{"code_execution": {}}], "contents": [ { "role": "user", "parts": [{ "text": "Can you print \"Hello world!\"?" 
}] },{ "role": "model", "parts": [ { "text": "" }, { "executable_code": { "language": "PYTHON", "code": "\nprint(\"hello world!\")\n" } }, { "code_execution_result": { "outcome": "OUTCOME_OK", "output": "hello world!\n" } }, { "text": "I have printed \"hello world!\" using the provided python code block. \n" } ], },{ "role": "user", "parts": [{ "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." }] } ] }' Input/output (I/O) Starting with Gemini 2.0 Flash , code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have Matplotlib graphs generated as part of the response. The output files are returned as inline images in the response. I/O pricing When using code execution I/O, you're charged for input tokens and output tokens: Input tokens: User prompt Output tokens: Code generated by the model Code execution output in the code environment Thinking tokens Summary generated by the model I/O details When you're working with code execution I/O, be aware of the following technical details: The maximum runtime of the code environment is 30 seconds. If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times. The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it. Code execution works best with text and CSV files. The input file can be passed in part.inlineData or part.fileData (uploaded via the Files API ), and the output file is always returned as part.inlineData . Single turn Bidirectional \ No newline at end of file diff --git a/docstore/1fb2ca3b-e831-4f5e-a88e-c5d38823fb68 b/docstore/1fb2ca3b-e831-4f5e-a88e-c5d38823fb68 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/1fb2ca3b-e831-4f5e-a88e-c5d38823fb68 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/1fdec9ad-b40d-4abc-9c29-6ea4f9fa83fb b/docstore/1fdec9ad-b40d-4abc-9c29-6ea4f9fa83fb new file mode 100644 index 0000000000000000000000000000000000000000..e2a63cb652cd12668827d65032b7973c573776f2 --- /dev/null +++ b/docstore/1fdec9ad-b40d-4abc-9c29-6ea4f9fa83fb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/caching#main-content Title: Context caching | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/1fe34fc4-797d-4913-9555-0d7538da1332 b/docstore/1fe34fc4-797d-4913-9555-0d7538da1332 new file mode 100644 index 0000000000000000000000000000000000000000..155726b0c29beaa4b366bda3c7400e5b8cd47772 --- /dev/null +++ b/docstore/1fe34fc4-797d-4913-9555-0d7538da1332 @@ -0,0 +1 @@ +context that you want to re-use many times, context caching can help reduce the costs associated with asking questions about that information. Does the context length affect the model latency? There is some fixed amount of latency in any given request, regardless of the size, but generally longer queries will have higher latency (time to first token). Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-05-20 UTC. \ No newline at end of file diff --git a/docstore/2001696c-49d2-40ac-9daf-4098949d5aa9 b/docstore/2001696c-49d2-40ac-9daf-4098949d5aa9 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2001696c-49d2-40ac-9daf-4098949d5aa9 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/201de003-ab3d-4d21-9660-a16876e9eb6e b/docstore/201de003-ab3d-4d21-9660-a16876e9eb6e new file mode 100644 index 0000000000000000000000000000000000000000..f509c38bfdd5532b577aa7eb3d9bd85a497c1f15 --- /dev/null +++ b/docstore/201de003-ab3d-4d21-9660-a16876e9eb6e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#veo-2 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/20249199-efaf-4f6e-90da-9bc9953ee84b b/docstore/20249199-efaf-4f6e-90da-9bc9953ee84b new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/20249199-efaf-4f6e-90da-9bc9953ee84b @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
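Because usable context varies this much between variants, it can help to check a prompt's size before sending it. Below is a minimal sketch using the count_tokens method from the google-genai Python SDK; the local file path is a placeholder and the model string is just one of the variants listed on this page.

Python
from google import genai

client = genai.Client()

# Hypothetical local file standing in for a large codebase or document dump.
with open("path/to/large_input.txt") as f:
    long_document = f.read()

# Count tokens first so the request can be checked against the model's
# input token limit before calling generate_content.
token_count = client.models.count_tokens(
    model="gemini-1.5-pro",
    contents=long_document,
)
print(token_count.total_tokens)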
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/203056a4-0c8e-453f-885d-cb46345a21b9 b/docstore/203056a4-0c8e-453f-885d-cb46345a21b9 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/203056a4-0c8e-453f-885d-cb46345a21b9 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels.
Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/203946d9-81ab-4ebb-9ce5-773805c28ef5 b/docstore/203946d9-81ab-4ebb-9ce5-773805c28ef5 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/203946d9-81ab-4ebb-9ce5-773805c28ef5 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/204a8999-cb98-46bb-9c81-60ed37fdba3a b/docstore/204a8999-cb98-46bb-9c81-60ed37fdba3a new file mode 100644 index 0000000000000000000000000000000000000000..257bd26d73ffcf83a4c86e6ef658baba1dfda511 --- /dev/null +++ b/docstore/204a8999-cb98-46bb-9c81-60ed37fdba3a @@ -0,0 +1 @@ +brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! 
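The automatic function calling example above is truncated partway through its tool definitions. Below is a self-contained sketch of the same pattern; the bodies of power_disco_ball_impl and start_music_impl are placeholder assumptions, since only dim_lights_impl appears in full above.

Python
from google import genai
from google.genai import types

# Placeholder tool implementations; only dim_lights_impl appears in full above.
def power_disco_ball_impl(power: bool) -> dict:
    """Powers the spinning disco ball. Returns the new power state."""
    return {"status": f"Disco ball powered {'on' if power else 'off'}"}

def start_music_impl(energetic: bool, loud: bool) -> dict:
    """Begins playing music. Returns the type of music being played."""
    return {"music_type": "energetic" if energetic else "chill", "loud": loud}

def dim_lights_impl(brightness: float) -> dict:
    """Dims the lights. 0.0 is off, 1.0 is full brightness."""
    return {"brightness": brightness}

client = genai.Client()

# Passing plain Python callables as tools enables automatic function calling:
# the SDK builds the schemas, runs the calls the model requests, and feeds the
# results back until the model produces its final text answer.
config = types.GenerateContentConfig(
    tools=[power_disco_ball_impl, start_music_impl, dim_lights_impl]
)
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn this place into a party!",
    config=config,
)
print(response.text)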
Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) -> dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( \ No newline at end of file diff --git a/docstore/204be81b-b123-4b21-81cf-85bf46eeca84 b/docstore/204be81b-b123-4b21-81cf-85bf46eeca84 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/204be81b-b123-4b21-81cf-85bf46eeca84 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/209a9889-8a94-4f30-9643-14e3257b95de b/docstore/209a9889-8a94-4f30-9643-14e3257b95de new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/209a9889-8a94-4f30-9643-14e3257b95de @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console .
log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/20c1261f-a40a-4ca9-a960-49a4552f27d1 b/docstore/20c1261f-a40a-4ca9-a960-49a4552f27d1 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/20c1261f-a40a-4ca9-a960-49a4552f27d1 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/20cfe992-c009-4b2b-a407-3c870bd7d781 b/docstore/20cfe992-c009-4b2b-a407-3c870bd7d781 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/20cfe992-c009-4b2b-a407-3c870bd7d781 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/20d0c18c-c934-4977-94e7-e7b49a7f3fef b/docstore/20d0c18c-c934-4977-94e7-e7b49a7f3fef new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/20d0c18c-c934-4977-94e7-e7b49a7f3fef @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/20edf501-58c8-4862-a411-dffa58060c7e b/docstore/20edf501-58c8-4862-a411-dffa58060c7e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/20edf501-58c8-4862-a411-dffa58060c7e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/20f1a74a-1f02-4b6e-866e-c8aea979675a b/docstore/20f1a74a-1f02-4b6e-866e-c8aea979675a new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/20f1a74a-1f02-4b6e-866e-c8aea979675a @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/210b0bd1-3220-4229-9733-d1860695a2de b/docstore/210b0bd1-3220-4229-9733-d1860695a2de new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/210b0bd1-3220-4229-9733-d1860695a2de @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/2113a4e6-393e-42d1-8992-2c516ebf43df b/docstore/2113a4e6-393e-42d1-8992-2c516ebf43df new file mode 100644 index 0000000000000000000000000000000000000000..3d32a6c6f44782138d2600dc9a5e7c5bf75a9a24 --- /dev/null +++ b/docstore/2113a4e6-393e-42d1-8992-2c516ebf43df @@ -0,0 +1 @@ +in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" 
# Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing \ No newline at end of file diff --git a/docstore/2117ae83-dcb8-4bcc-a945-e02986eb13f8 b/docstore/2117ae83-dcb8-4bcc-a945-e02986eb13f8 new file mode 100644 index 0000000000000000000000000000000000000000..be865665baa597a5b341e658abc6f47e616f09e1 --- /dev/null +++ b/docstore/2117ae83-dcb8-4bcc-a945-e02986eb13f8 @@ -0,0 +1 @@ +OpenAI compatibility | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . 
choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, \ No newline at end of file diff --git a/docstore/2118c198-e509-49fc-9c26-f7d06fdddea9 b/docstore/2118c198-e509-49fc-9c26-f7d06fdddea9 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/2118c198-e509-49fc-9c26-f7d06fdddea9 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/214bc6a5-ea4f-4fd2-a343-7dab4acd30bc b/docstore/214bc6a5-ea4f-4fd2-a343-7dab4acd30bc new file mode 100644 index 0000000000000000000000000000000000000000..3c7819093774ca7626711ba692d14925f51fa93f --- /dev/null +++ b/docstore/214bc6a5-ea4f-4fd2-a343-7dab4acd30bc @@ -0,0 +1 @@ +text embeddings: Python from google import genai client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . 
embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , }); console . log ( response . embeddings ); } main (); Go package main import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := [] * genai . Content { genai . NewContentFromText ( "What is the meaning of life?" , genai . RoleUser ), } result , err := client . Models . EmbedContent ( ctx , "gemini-embedding-exp-03-07" , contents , nil , ) if err != nil { log . Fatal ( err ) } embeddings , err := json . MarshalIndent ( result . Embeddings , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( string ( embeddings )) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]} }' You can also generate embeddings for multiple chunks at once by passing them in as a list of strings. Task types When building Retrieval Augmented Generation (RAG) systems, a common design is to use text embeddings to perform a similarity search. In some cases this can lead to degraded quality, because questions and their answers are not semantically similar. For example, a question like "Why is the sky blue?" and its answer "The scattering of sunlight causes the blue color," have distinctly different \ No newline at end of file diff --git a/docstore/214ebc98-6631-49e8-96c2-60b994cfb512 b/docstore/214ebc98-6631-49e8-96c2-60b994cfb512 new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/214ebc98-6631-49e8-96c2-60b994cfb512 @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/2156ac23-33f0-4412-806e-3fb463a54874 b/docstore/2156ac23-33f0-4412-806e-3fb463a54874 new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/2156ac23-33f0-4412-806e-3fb463a54874 @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/2172cea5-26c5-422d-91c0-a3820464a1d3 b/docstore/2172cea5-26c5-422d-91c0-a3820464a1d3 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/2172cea5-26c5-422d-91c0-a3820464a1d3 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
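Following on from the few-shot prompting guidance above, here is a minimal sketch of a few-shot prompt sent through the google-genai Python SDK. The sentiment-labeling task and its example reviews are illustrative assumptions rather than content from this page.

Python
from google import genai

client = genai.Client()

# A few-shot prompt: the examples establish the task, label set, and output
# format, so a long instruction block isn't strictly required.
few_shot_prompt = """Classify the sentiment of the review as POSITIVE, NEUTRAL or NEGATIVE.

Review: "The battery lasts all day and the screen is gorgeous."
Sentiment: POSITIVE

Review: "It works, but the setup instructions were confusing."
Sentiment: NEUTRAL

Review: "Stopped charging after a week. Very disappointed."
Sentiment: NEGATIVE

Review: "Fast shipping and the build quality exceeded my expectations."
Sentiment:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)  # Expected to follow the established format, e.g. "POSITIVE"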
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/2174e23b-04d0-42ae-8d70-e910d58ec7b8 b/docstore/2174e23b-04d0-42ae-8d70-e910d58ec7b8 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/2174e23b-04d0-42ae-8d70-e910d58ec7b8 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/217788f4-7732-4f8b-b66d-e2f62a9f71e6 b/docstore/217788f4-7732-4f8b-b66d-e2f62a9f71e6 new file mode 100644 index 0000000000000000000000000000000000000000..0a30470486c76dc6778a9edd107dc1585facf068 --- /dev/null +++ b/docstore/217788f4-7732-4f8b-b66d-e2f62a9f71e6 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#main-content Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2178b4ce-e350-4771-8755-28cd68467cbc b/docstore/2178b4ce-e350-4771-8755-28cd68467cbc new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/2178b4ce-e350-4771-8755-28cd68467cbc @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/2194e4b1-cefd-48d1-b676-4b2a9e58871c b/docstore/2194e4b1-cefd-48d1-b676-4b2a9e58871c new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/2194e4b1-cefd-48d1-b676-4b2a9e58871c @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. 
Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and the mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing the ClientSession into the tools parameter. If you want to disable it, you can provide automatic_function_calling with disable=True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/21aa38b5-f78c-4f52-bb96-7abbc0677712 b/docstore/21aa38b5-f78c-4f52-bb96-7abbc0677712 new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/21aa38b5-f78c-4f52-bb96-7abbc0677712 @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples.
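To make the file-input best practice above concrete, here is a rough sketch that builds a JSONL input file and submits it as a batch job with the google-genai Python SDK. The per-line request shape, the jsonl MIME type, and the batches.create call follow the Batch Mode documentation as closely as possible, but verify the exact field names against the current API reference before relying on them.

Python
import json
from google import genai

client = genai.Client()

# One request per line; "key" links each response back to its request.
prompts = ["Explain tokenization in one paragraph.", "Explain context caching in one paragraph."]
with open("batch_requests.jsonl", "w") as f:
    for i, prompt in enumerate(prompts):
        row = {"key": f"request-{i}", "request": {"contents": [{"parts": [{"text": prompt}]}]}}
        f.write(json.dumps(row) + "\n")

# Upload the JSONL file with the Files API, then create the batch job from it.
uploaded = client.files.upload(
    file="batch_requests.jsonl",
    config={"display_name": "batch_requests", "mime_type": "jsonl"},
)
job = client.batches.create(
    model="gemini-2.5-flash",
    src=uploaded.name,
)
print(job.name)  # Poll with client.batches.get(name=job.name) until the job completes.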
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/21b28c01-019c-40b6-bc2b-aac02b9f4c11 b/docstore/21b28c01-019c-40b6-bc2b-aac02b9f4c11 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/21b28c01-019c-40b6-bc2b-aac02b9f4c11 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/21bd1f25-23db-4213-9a8d-2a8ee8d8ece2 b/docstore/21bd1f25-23db-4213-9a8d-2a8ee8d8ece2 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/21bd1f25-23db-4213-9a8d-2a8ee8d8ece2 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/21ddc70c-31c1-449e-a695-02af578d57e5 b/docstore/21ddc70c-31c1-449e-a695-02af578d57e5 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/21ddc70c-31c1-449e-a695-02af578d57e5 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/21e31695-6d5b-40d4-9937-bde723540f9c b/docstore/21e31695-6d5b-40d4-9937-bde723540f9c new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/21e31695-6d5b-40d4-9937-bde723540f9c @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. 
Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/222f98fb-dc30-4348-8bdd-c43abaf2b4e6 b/docstore/222f98fb-dc30-4348-8bdd-c43abaf2b4e6 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/222f98fb-dc30-4348-8bdd-c43abaf2b4e6 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/223338b5-6716-4af3-97bc-4c0dd2434069 b/docstore/223338b5-6716-4af3-97bc-4c0dd2434069 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/223338b5-6716-4af3-97bc-4c0dd2434069 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/228d82b0-f342-4cbb-8fc3-cb2f560d9b2e b/docstore/228d82b0-f342-4cbb-8fc3-cb2f560d9b2e new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/228d82b0-f342-4cbb-8fc3-cb2f560d9b2e @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and may come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/22a2edb2-5b92-4f64-82f2-15750d176d06 b/docstore/22a2edb2-5b92-4f64-82f2-15750d176d06 new file mode 100644 index 0000000000000000000000000000000000000000..f13076550935e70477ccf270f1bed1e7b9f11dc0 --- /dev/null +++ b/docstore/22a2edb2-5b92-4f64-82f2-15750d176d06 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#upload-image Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/22af8acc-57c5-4432-a88b-e1a9019396cd b/docstore/22af8acc-57c5-4432-a88b-e1a9019396cd new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/22af8acc-57c5-4432-a88b-e1a9019396cd @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want.
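For example, here is a minimal Python sketch (assuming the google-genai SDK, a GEMINI_API_KEY in the environment, and a hypothetical local photo named dish.jpg) of stating the required output format directly in a multimodal prompt rather than asking for a generic description:

from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Hypothetical local image; swap in your own file.
with open("dish.jpg", "rb") as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        # The specific requirement lives in the prompt itself.
        "List every ingredient visible in this image as a bulleted list, one item per line.",
    ],
)
print(response.text)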
For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/22ce81c4-21a4-45f8-85b5-19a2805ae6d7 b/docstore/22ce81c4-21a4-45f8-85b5-19a2805ae6d7 new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/22ce81c4-21a4-45f8-85b5-19a2805ae6d7 @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . 
\ No newline at end of file diff --git a/docstore/22d8a3ad-f300-4408-85ab-9e5e2aa55b7f b/docstore/22d8a3ad-f300-4408-85ab-9e5e2aa55b7f new file mode 100644 index 0000000000000000000000000000000000000000..d276bcbbbbd8ffd587f83aebc6d230d4e5c5b078 --- /dev/null +++ b/docstore/22d8a3ad-f300-4408-85ab-9e5e2aa55b7f @@ -0,0 +1 @@ +Gemini Developer API Pricing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Pricing The Gemini API "free tier" is offered through the API service with lower rate limits for testing purposes. Google AI Studio usage is completely free in all available countries. The Gemini API "paid tier" comes with higher rate limits , additional features, and different data handling. Upgrade to the Paid Tier If you're looking to reduce costs and your use case doesn't require immediate real-time responses, check out Batch Mode . Batch Mode is designed to process large volumes of requests asynchronously. Requests submitted using this mode is 50% of the price of interactive (non-batch mode) requests. Gemini 2.5 Pro Try it in Google AI Studio Our state-of-the-art multipurpose model, which excels at coding and complex reasoning tasks. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 200k tokens $2.50, prompts > 200k tokens Output price (including thinking tokens) Free of charge $10.00, prompts <= 200k tokens $15.00, prompts > 200k Context caching price Not available $0.31, prompts <= 200k tokens $0.625, prompts > 200k $4.50 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Not available 1,500 RPD (free), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Try it in Google AI Studio Our first hybrid reasoning model which supports a 1M token context window and has thinking budgets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.30 (text / image / video) $1.00 (audio) Output price (including thinking tokens) Free of charge $2.50 Context \ No newline at end of file diff --git a/docstore/22f9697f-63c5-43bd-a343-246f62011c75 b/docstore/22f9697f-63c5-43bd-a343-246f62011c75 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/22f9697f-63c5-43bd-a343-246f62011c75 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . 
serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/2340c4f9-4ef1-45d3-ab40-22e714133137 b/docstore/2340c4f9-4ef1-45d3-ab40-22e714133137 new file mode 100644 index 0000000000000000000000000000000000000000..58c080b28fe0c0a0f77a553f5c6816f2c420fdd2 --- /dev/null +++ b/docstore/2340c4f9-4ef1-45d3-ab40-22e714133137 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "I have a math question for you:" }], }, { role : "model" , parts : [{ text : "Great! I'm ready for your math question. Please ask away." }], }, ], config : { tools : [{ codeExecution : {}}], } }); const response = await chat . sendMessage ({ message : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." }); console . log ( "Chat response:" , response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , config , nil , ) result , _ := chat . SendMessage ( ctx , genai . Part { Text : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and " + "make sure you get all 50." , }, ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . 
CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"tools": \ No newline at end of file diff --git a/docstore/2373f0a2-5239-489f-9895-b649a676cffa b/docstore/2373f0a2-5239-489f-9895-b649a676cffa new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2373f0a2-5239-489f-9895-b649a676cffa @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/237e6d97-13ef-42a7-917e-439509244467 b/docstore/237e6d97-13ef-42a7-917e-439509244467 new file mode 100644 index 0000000000000000000000000000000000000000..276b272d06bde6f464a4287876b38d8d2bc17eb7 --- /dev/null +++ b/docstore/237e6d97-13ef-42a7-917e-439509244467 @@ -0,0 +1 @@ +Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Migrate to the Google GenAI SDK Starting with the Gemini 2.0 release in late 2024, we introduced a new set of libraries called the Google GenAI SDK . 
It offers an improved developer experience through an updated client architecture , and simplifies the transition between developer and enterprise workflows. The Google GenAI SDK is now in General Availability (GA) across all supported platforms. If you're using one of our legacy libraries , we strongly recommend you to migrate. This guide provides before-and-after examples of migrated code to help you get started. Note: The Go examples omit imports and other boilerplate code to improve readability. Installation Before Python pip install -U -q "google-generativeai" JavaScript npm install @google/generative-ai Go go get github.com/google/generative-ai-go After Python pip install -U -q "google-genai" JavaScript npm install @google/genai Go go get google.golang.org/genai API access The old SDK implicitly handled the API client behind the scenes using a variety of ad hoc methods. This made it hard to manage the client and credentials. Now, you interact through a central Client object. This Client object acts as a single entry point for various API services (e.g., models , chats , files , tunings ), promoting consistency and simplifying credential and configuration management across different API calls. Before (Less Centralized API Access) Python The old SDK didn't explicitly use a top-level client object for most API calls. You would directly instantiate and interact with GenerativeModel objects. import \ No newline at end of file diff --git a/docstore/239d7aad-d4e2-43f8-a2a7-e3648bcd6c07 b/docstore/239d7aad-d4e2-43f8-a2a7-e3648bcd6c07 new file mode 100644 index 0000000000000000000000000000000000000000..4780dc08e999a42c290d3d4e3fc44048c74b55ad --- /dev/null +++ b/docstore/239d7aad-d4e2-43f8-a2a7-e3648bcd6c07 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation#imagen-prompt-guide Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/23cbe376-b867-4aa7-b09c-93771a7e43a3 b/docstore/23cbe376-b867-4aa7-b09c-93771a7e43a3 new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/23cbe376-b867-4aa7-b09c-93771a7e43a3 @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. 
Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example \ No newline at end of file diff --git a/docstore/23dffb46-59b6-4240-adf9-4e142533467a b/docstore/23dffb46-59b6-4240-adf9-4e142533467a new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/23dffb46-59b6-4240-adf9-4e142533467a @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a the 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/23fae627-2f3c-4654-bba9-142b264e1c6c b/docstore/23fae627-2f3c-4654-bba9-142b264e1c6c new file mode 100644 index 0000000000000000000000000000000000000000..1426f6277d87da029e324e49b5a4fcb88dde544c --- /dev/null +++ b/docstore/23fae627-2f3c-4654-bba9-142b264e1c6c @@ -0,0 +1 @@ +live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/2418a1a8-910b-41e9-8f97-f60044f3ba48 b/docstore/2418a1a8-910b-41e9-8f97-f60044f3ba48 new file mode 100644 index 0000000000000000000000000000000000000000..b410c3d6fc95b8ad9abaf080d511bad548d2b4e0 --- /dev/null +++ b/docstore/2418a1a8-910b-41e9-8f97-f60044f3ba48 @@ -0,0 +1 @@ +get all 50." ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , tools : [{ codeExecution : {} }], }); const result = await model . generateContent ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get " + "all 50." , ); console . log ( result . response . text ()); After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the sum of the first 50 prime numbers? Generate and run ' 'code for the calculation, and make sure you get all 50.' , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )], ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-pro-exp-02-05" , contents : `Write and execute code that calculates the sum of the first 50 prime numbers. Ensure that only the executable code and its resulting output are generated.` , }); // Each part may contain text, executable code, or an execution result. for ( const part of response . candidates [ 0 ]. content . parts ) { console . log ( part ); console . log ( "\n" ); } console . log ( "-" . 
repeat ( 80 )); // The `.text` accessor concatenates the parts into a markdown-formatted text. console . log ( "\n" , response . text ); Search grounding GoogleSearch (Gemini>=2.0) and GoogleSearchRetrieval (Gemini < 2.0) are tools that allow the model to retrieve public web data for grounding, powered by Google. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( contents = "what is the \ No newline at end of file diff --git a/docstore/243dee97-df0b-40cd-a56e-4936fe943a28 b/docstore/243dee97-df0b-40cd-a56e-4936fe943a28 new file mode 100644 index 0000000000000000000000000000000000000000..69f7399c35aaaad68e1bd1a996c44353577b3a79 --- /dev/null +++ b/docstore/243dee97-df0b-40cd-a56e-4936fe943a28 @@ -0,0 +1 @@ +the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await \ No newline at end of file diff --git a/docstore/2449f2a2-a8df-4e9a-985b-62c0abc097cb b/docstore/2449f2a2-a8df-4e9a-985b-62c0abc097cb new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/2449f2a2-a8df-4e9a-985b-62c0abc097cb @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
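A minimal sketch (google-genai Python SDK assumed; the prompt is only illustrative) of calling this preview model by its full, dated preview name:

from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Preview models are addressed by their full, dated name rather than a stable alias.
response = client.models.generate_content(
    model="gemini-2.5-flash-lite-preview-06-17",
    contents="Summarize the benefits of low-latency models in one sentence.",
)
print(response.text)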
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/2456e501-b3c3-40b4-a16b-68dc85d39357 b/docstore/2456e501-b3c3-40b4-a16b-68dc85d39357 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/2456e501-b3c3-40b4-a16b-68dc85d39357 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and may come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> .
For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/24757e8c-9233-46e1-a6d8-317debfa517d b/docstore/24757e8c-9233-46e1-a6d8-317debfa517d new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/24757e8c-9233-46e1-a6d8-317debfa517d @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/2480dccc-ba27-4543-9bd0-5ea4372389fe b/docstore/2480dccc-ba27-4543-9bd0-5ea4372389fe new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/2480dccc-ba27-4543-9bd0-5ea4372389fe @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
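Putting the version name patterns described earlier into practice for this model, a minimal sketch (google-genai Python SDK assumed) of using the latest stable alias versus pinning a specific stable version:

from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Latest stable alias: tracks the newest stable release of this variation.
latest = client.models.generate_content(model="gemini-2.0-flash", contents="Hello")

# Pinned stable version: preferred for production so behavior doesn't shift underneath you.
pinned = client.models.generate_content(model="gemini-2.0-flash-001", contents="Hello")

print(latest.text)
print(pinned.text)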
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/2492cb78-9b11-45d1-bc8f-bbb458a75832 b/docstore/2492cb78-9b11-45d1-bc8f-bbb458a75832 new file mode 100644 index 0000000000000000000000000000000000000000..05f586b9ee4ba7b248a7cf2844965480ce1e46ee --- /dev/null +++ b/docstore/2492cb78-9b11-45d1-bc8f-bbb458a75832 @@ -0,0 +1 @@ +Code execution | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Code execution The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks. Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it. Enable code execution To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What is the sum of the first 50 prime numbers? 
" "Generate and run code for the calculation, and make sure you get all 50." , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new \ No newline at end of file diff --git a/docstore/249c3afa-2b29-421d-8a91-08579ef195b2 b/docstore/249c3afa-2b29-421d-8a91-08579ef195b2 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/249c3afa-2b29-421d-8a91-08579ef195b2 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. 
Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/24ab29b8-3524-4a84-8b2a-c2f390aa5a94 b/docstore/24ab29b8-3524-4a84-8b2a-c2f390aa5a94 new file mode 100644 index 0000000000000000000000000000000000000000..67c383b5aab6b05ef4e22f9b7a5759f708e0820a --- /dev/null +++ b/docstore/24ab29b8-3524-4a84-8b2a-c2f390aa5a94 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/pricing#main-content Title: Gemini Developer API Pricing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/24bc9a20-239a-45fe-a066-7322f708daaa b/docstore/24bc9a20-239a-45fe-a066-7322f708daaa new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/24bc9a20-239a-45fe-a066-7322f708daaa @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/24dac433-495f-40df-9da0-6b5c85c4687f b/docstore/24dac433-495f-40df-9da0-6b5c85c4687f new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/24dac433-495f-40df-9da0-6b5c85c4687f @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. 
Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/24e2d49f-108b-4e2a-bef5-b7fabe023cf0 b/docstore/24e2d49f-108b-4e2a-bef5-b7fabe023cf0 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/24e2d49f-108b-4e2a-bef5-b7fabe023cf0 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
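A minimal sketch (google-genai Python SDK assumed) of calling 2.5 Flash and capping its thinking budget for latency-sensitive traffic, mirroring the thinkingConfig REST example shown earlier:

from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    # thinking_budget=0 turns thinking off; omit the config to keep the default behavior.
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)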
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2536080f-d0c5-4715-a65f-512760c37546 b/docstore/2536080f-d0c5-4715-a65f-512760c37546 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/2536080f-d0c5-4715-a65f-512760c37546 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/2536f5c0-fe1c-4388-83fc-7c0eee60fbef b/docstore/2536f5c0-fe1c-4388-83fc-7c0eee60fbef new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/2536f5c0-fe1c-4388-83fc-7c0eee60fbef @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. 
A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/253860c8-f986-43f5-9059-a30548fc4bdd b/docstore/253860c8-f986-43f5-9059-a30548fc4bdd new file mode 100644 index 0000000000000000000000000000000000000000..56a0e0c51541a344a1d82bf14602c14a0c07788e --- /dev/null +++ b/docstore/253860c8-f986-43f5-9059-a30548fc4bdd @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live#audio-generation Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/257b3937-4101-485e-ae94-f987c7d198dc b/docstore/257b3937-4101-485e-ae94-f987c7d198dc new file mode 100644 index 0000000000000000000000000000000000000000..180b08735f7436a86f48a0e09e82922e8905e52d --- /dev/null +++ b/docstore/257b3937-4101-485e-ae94-f987c7d198dc @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/api-key#set-api-env-var Title: Using Gemini API keys | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/257ed8a6-9b5e-419a-ac4e-733718962a2e b/docstore/257ed8a6-9b5e-419a-ac4e-733718962a2e new file mode 100644 index 0000000000000000000000000000000000000000..122c682d2774097387ed4735af082d43f98d76f5 --- /dev/null +++ b/docstore/257ed8a6-9b5e-419a-ac4e-733718962a2e @@ -0,0 +1 @@ +regions. Can I use 1M tokens in the free tier? The free tier for Gemini API differs based on the model selected. For now, you can try the 1M token context window in the following ways: In Google AI Studio With pay-as-you-go plans With free-of-charge plans for select models See the latest free-of-charge rate limits per model on rate limits page . How can I calculate the number of tokens I'm using? 
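The answer that follows refers to GenerativeModel.count_tokens from the legacy SDK. As a minimal sketch, the equivalent call in the current google-genai Python SDK is client.models.count_tokens (an assumption about which SDK you are using; the model name and input text are illustrative):

from google import genai

client = genai.Client()

# Count tokens before sending a request, to stay within the context window.
result = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents="The quick brown fox jumps over the lazy dog.",
)
print(result.total_tokens)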
Use the GenerativeModel.count_tokens method to count the number of tokens. Refer to the Tokens guide to learn more about tokens. Can I use my Google Cloud credits with the Gemini API? Yes, Google Cloud credits can be used towards Gemini API usage. How is billing handled? Billing for the Gemini API is handled by the Cloud Billing system. Am I charged for failed requests? If your request fails with a 400 or 500 error, you won't be charged for the tokens used. However, the request will still count against your quota. Is there a charge for fine-tuning the models? Model tuning is free, but inference on tuned models is charged at the same rate as the base models. Is GetTokens billed? Requests to the GetTokens API are not billed, and they don't count against inference quota. How is my Google AI Studio data handled if I have a paid API account? Refer to the terms for details on how data is handled when Cloud billing is enabled (see "How Google Uses Your Data" under "Paid Services"). Note that your Google AI Studio prompts are treated under the same "Paid Services" terms so long as at least 1 API project has billing enabled, which you can validate on the Gemini API Key page if you see any projects marked as "Paid" under "Plan". Where can I get help with billing? To get help with billing, see Get Cloud Billing support . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered \ No newline at end of file diff --git a/docstore/258009ef-afb2-4e51-a382-8224d91b1f6a b/docstore/258009ef-afb2-4e51-a382-8224d91b1f6a new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/258009ef-afb2-4e51-a382-8224d91b1f6a @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/2584505b-1f3c-49f8-a4f1-1cc67e214e30 b/docstore/2584505b-1f3c-49f8-a4f1-1cc67e214e30 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/2584505b-1f3c-49f8-a4f1-1cc67e214e30 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/258bf72a-5884-4155-ac48-e6fbb1acd8e9 b/docstore/258bf72a-5884-4155-ac48-e6fbb1acd8e9 new file mode 100644 index 0000000000000000000000000000000000000000..d1b83172ef094b37793cd8d72611bb079685a2af --- /dev/null +++ b/docstore/258bf72a-5884-4155-ac48-e6fbb1acd8e9 @@ -0,0 +1 @@ +with Google Search Not available Not available Used to improve our products Yes No Imagen 4 Preview Try it in Google AI Studio Our latest image generation model, with significantly better text rendering and better overall image quality. Preview models may change before becoming stable and have more restrictive rate limits. 
Free Tier Paid Tier, per Image in USD Imagen 4 Standard image price Not available $0.04 Imagen 4 Ultra image price Not available $0.06 Used to improve our products Yes No Imagen 3 Try it in Google AI Studio Our state-of-the-art image generation model, available to developers on the paid tier of the Gemini API. Free Tier Paid Tier, per Image in USD Image price Not available $0.03 Used to improve our products Yes No Veo 2 Try the API Our state-of-the-art video generation model, available to developers on the paid tier of the Gemini API. Free Tier Paid Tier, per second in USD Video price Not available $0.35 Used to improve our products Yes No Gemma 3 Try Gemma 3 Our lightweight, state-of the art, open model built from the same technology that powers our Gemini models. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemma 3n Try Gemma 3n Our open model built for efficient performance on everyday devices like mobile phones, laptops, and tablets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemini 1.5 Flash Try it in Google AI \ No newline at end of file diff --git a/docstore/258e790b-19c4-46e9-9df2-bf650cdd1ae4 b/docstore/258e790b-19c4-46e9-9df2-bf650cdd1ae4 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/258e790b-19c4-46e9-9df2-bf650cdd1ae4 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/259eda2c-f7bb-49b9-a30a-362c0b7452ff b/docstore/259eda2c-f7bb-49b9-a30a-362c0b7452ff new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/259eda2c-f7bb-49b9-a30a-362c0b7452ff @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/25c4630f-5c07-4a6d-9a55-3f30f3a099f3 b/docstore/25c4630f-5c07-4a6d-9a55-3f30f3a099f3 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/25c4630f-5c07-4a6d-9a55-3f30f3a099f3 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/25d5357b-f017-4928-84f2-991628c15f5b b/docstore/25d5357b-f017-4928-84f2-991628c15f5b new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/25d5357b-f017-4928-84f2-991628c15f5b @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/25e0fe5b-7a60-4ac1-807a-44505a7d774e b/docstore/25e0fe5b-7a60-4ac1-807a-44505a7d774e new file mode 100644 index 0000000000000000000000000000000000000000..5389b5d9d1b7115f0b4483b4e1da807b20a5cfd5 --- /dev/null +++ b/docstore/25e0fe5b-7a60-4ac1-807a-44505a7d774e @@ -0,0 +1 @@ +such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. 
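The parameter-schema fields described above (type, properties, description, enum, required) combine as in the following sketch. The function itself ("set_light_color") and its parameters are hypothetical, but the enum values mirror the example given in the text.

# Hypothetical function declaration illustrating type/properties/description/enum/required.
set_light_color = {
    "name": "set_light_color",
    "description": "Sets the color temperature of a smart light in a given room.",
    "parameters": {
        "type": "object",
        "properties": {
            "room": {
                "type": "string",
                "description": "The room containing the light, e.g. 'kitchen'.",
            },
            "color_temperature": {
                "type": "string",
                "description": "The desired color temperature preset.",
                "enum": ["daylight", "cool", "warm"],
            },
        },
        "required": ["room", "color_temperature"],
    },
}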
Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake \ No newline at end of file diff --git a/docstore/25f6209a-35ce-43d0-8fa2-cbea468f8ca2 b/docstore/25f6209a-35ce-43d0-8fa2-cbea468f8ca2 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/25f6209a-35ce-43d0-8fa2-cbea468f8ca2 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/25f74300-f6d5-4bb9-88dd-06b721af4ca3 b/docstore/25f74300-f6d5-4bb9-88dd-06b721af4ca3 new file mode 100644 index 0000000000000000000000000000000000000000..ef7f15fc424a675301db24205f149dd75b0faa06 --- /dev/null +++ b/docstore/25f74300-f6d5-4bb9-88dd-06b721af4ca3 @@ -0,0 +1 @@ +Ephemeral tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Ephemeral tokens Ephemeral tokens are short-lived authentication tokens for accessing the Gemini API through WebSockets . They are designed to enhance security when you are connecting directly from a user's device to the API (a client-to-server implementation). 
Like standard API keys, ephemeral tokens can be extracted from client-side applications such as web browsers or mobile apps. But because ephemeral tokens expire quickly and can be restricted, they significantly reduce the security risks in a production environment. Note: Ephemeral tokens are only compatible with Live API at this time. You should use them when accessing the Live API directly from client-side applications to enhance API key security. How ephemeral tokens work Here's how ephemeral tokens work at a high level: Your client (e.g. web app) authenticates with your backend. Your backend requests an ephemeral token from Gemini API's provisioning service. Gemini API issues a short-lived token. Your backend sends the token to the client for WebSocket connections to Live API. You can do this by swapping your API key with an ephemeral token. The client then uses the token as if it were an API key. This enhances security because even if extracted, the token is short-lived, unlike a long-lived API key deployed client-side. Since the client sends data directly to Gemini, this also improves latency and avoids your backends needing to proxy the real time data. Create an ephemeral token Here is a simplified example of how to get an ephemeral token from Gemini. By default, you'll have 1 minute to start new Live API \ No newline at end of file diff --git a/docstore/2608d5a0-f8e2-4b90-a100-ba1e119937eb b/docstore/2608d5a0-f8e2-4b90-a100-ba1e119937eb new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/2608d5a0-f8e2-4b90-a100-ba1e119937eb @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
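As a minimal sketch of using the text-embedding-004 model described above, the google-genai Python SDK exposes an embed_content method; the input string is arbitrary, and the dimension check reflects the output dimension size listed in the model details.

from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)
# The model details above list an output dimension size of 768.
print(len(result.embeddings[0].values))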
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/260c044a-53c8-4651-b003-3d019df0f524 b/docstore/260c044a-53c8-4651-b003-3d019df0f524 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/260c044a-53c8-4651-b003-3d019df0f524 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. 
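The Go, REST, and Apps Script snippets above all replay the same two-turn history and then stream a follow-up question. A rough Python equivalent using the google-genai SDK is sketched below; the exact history types (types.Content / types.Part) are an assumption based on that SDK.

from google import genai
from google.genai import types

client = genai.Client()

# Replay the prior turns as chat history.
history = [
    types.Content(role="user", parts=[types.Part(text="Hello")]),
    types.Content(
        role="model",
        parts=[types.Part(text="Great to meet you. What would you like to know?")],
    ),
]

chat = client.chats.create(model="gemini-2.5-flash", history=history)

# Stream the next turn and print chunks as they arrive.
for chunk in chat.send_message_stream(
    "I have two dogs in my house. How many paws are in my house?"
):
    print(chunk.text, end="")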
To learn more about the models \ No newline at end of file diff --git a/docstore/2661fc3b-39ce-4cfc-b005-2cc42b48fa27 b/docstore/2661fc3b-39ce-4cfc-b005-2cc42b48fa27 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/2661fc3b-39ce-4cfc-b005-2cc42b48fa27 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/26c51b6f-cc37-4a2f-8422-f34f959c877c b/docstore/26c51b6f-cc37-4a2f-8422-f34f959c877c new file mode 100644 index 0000000000000000000000000000000000000000..007f635c2d64ef42082bdbd7a31da59c854333a3 --- /dev/null +++ b/docstore/26c51b6f-cc37-4a2f-8422-f34f959c877c @@ -0,0 +1 @@ +SILENT } ) JavaScript import { GoogleGenAI , Modality , Behavior , FunctionResponseScheduling } from '@google/genai' ; // for a non-blocking function definition, apply scheduling in the function response: const functionResponse = { id : fc . id , name : fc . name , response : { result : "ok" , scheduling : FunctionResponseScheduling . INTERRUPT // Can also be WHEN_IDLE or SILENT } } Code execution You can define code execution as part of the session configuration. This lets the Live API generate and execute Python code and dynamically perform computations to benefit your results. See the Code execution tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . 
Client () model = "gemini-live-2.5-flash-preview" tools = [{ 'code_execution' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Compute the largest prime palindrome under 100000." await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ codeExecution : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = \ No newline at end of file diff --git a/docstore/26cc8395-a368-4715-af27-cbb2811574fc b/docstore/26cc8395-a368-4715-af27-cbb2811574fc new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/26cc8395-a368-4715-af27-cbb2811574fc @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/26cdc70b-17a6-4cda-b761-6c7954f4b99e b/docstore/26cdc70b-17a6-4cda-b761-6c7954f4b99e new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/26cdc70b-17a6-4cda-b761-6c7954f4b99e @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/26d833f8-a904-47c0-91ce-a56414007260 b/docstore/26d833f8-a904-47c0-91ce-a56414007260 new file mode 100644 index 0000000000000000000000000000000000000000..81bd025c83c281ffc71cf5faff30eb0cffa79ae1 --- /dev/null +++ b/docstore/26d833f8-a904-47c0-91ce-a56414007260 @@ -0,0 +1 @@ +Dialog 1 25,000 50 Tier 2 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 1000 10,000,000 -- Gemini 2.0 Flash Live 1000 10,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 100 1,000,000 Unlimited Gemini 2.5 Flash Experimental Native Audio Thinking Dialog -- -- -- Tier 3 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 1000 10,000,000 -- Gemini 2.0 Flash Live 1000 10,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog -- -- -- Gemini 2.5 Flash Experimental Native Audio Thinking Dialog -- -- -- Specified rate limits are not guaranteed and actual capacity may vary. Batch Mode rate limits Batch Mode requests are subject to their own rate limits, separate from the non-batch mode API calls. 
Concurrent batch requests: 100 Input file size limit: 2GB File storage limit: 20GB Enqueued tokens per model: The following table outlines the maximum number of tokens that can be enqueued for batch processing across all your active batch jobs for a given model. Tier 1 Model Enqueued Tokens Limit Gemini 2.5 Pro 5,000,000 Gemini 2.5 Flash 3,000,000 Gemini 2.0 Flash 10,000,000 Gemini 2.0 Flash-Lite 10,000,000 Tier 2 Model Enqueued Tokens Limit Gemini 2.5 Pro 500,000,000 Gemini 2.5 Flash 400,000,000 Gemini 2.0 Flash 1,000,000,000 Gemini 2.0 Flash-Lite 1,000,000,000 Tier 3 Model Enqueued Tokens Limit Gemini 2.5 Pro 1,000,000,000 Gemini 2.5 Flash 1,000,000,000 Gemini 2.0 Flash 5,000,000,000 Gemini 2.0 Flash-Lite 5,000,000,000 Specified rate limits are not guaranteed and actual capacity may vary. How to upgrade to the next tier The Gemini API uses Cloud Billing for all billing services. To transition from the Free tier to a paid tier, you must first enable Cloud Billing for your Google Cloud project. Once your project meets the specified criteria, it becomes eligible for an upgrade to the next tier. To request an upgrade, follow these steps: Navigate to the API keys page in AI Studio. Locate the project you want to upgrade and click "Upgrade". The "Upgrade" option \ No newline at end of file diff --git a/docstore/26e21b86-9b85-46b8-bd81-00efdd00e640 b/docstore/26e21b86-9b85-46b8-bd81-00efdd00e640 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/26e21b86-9b85-46b8-bd81-00efdd00e640 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/26f8bf36-6d2e-4684-9a87-25bf1feb57af b/docstore/26f8bf36-6d2e-4684-9a87-25bf1feb57af new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/26f8bf36-6d2e-4684-9a87-25bf1feb57af @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/27016b61-b79b-4664-bcad-c36847e0cf03 b/docstore/27016b61-b79b-4664-bcad-c36847e0cf03 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/27016b61-b79b-4664-bcad-c36847e0cf03 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/270655a2-a9ed-4e17-9c05-95146dfe20ec b/docstore/270655a2-a9ed-4e17-9c05-95146dfe20ec new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/270655a2-a9ed-4e17-9c05-95146dfe20ec @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
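The model overview above notes that 2.5 Flash thinks as needed but also lets you configure a thinking budget. Here is a minimal sketch, assuming the google-genai Python SDK's ThinkingConfig; the 1,024-token budget is an arbitrary illustrative value.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the concept of Occam's Razor and provide a simple, everyday example.",
    # Cap the internal thinking at roughly 1,024 tokens (illustrative value).
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)
print(response.text)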
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/273e5cac-50ec-42a2-95d9-2a826810113e b/docstore/273e5cac-50ec-42a2-95d9-2a826810113e new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/273e5cac-50ec-42a2-95d9-2a826810113e @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/2775e652-6a95-41a8-b2e7-6efd4326bb46 b/docstore/2775e652-6a95-41a8-b2e7-6efd4326bb46 new file mode 100644 index 0000000000000000000000000000000000000000..5a699f69f6d156fe48df2cdc0cce67743224fcfa --- /dev/null +++ b/docstore/2775e652-6a95-41a8-b2e7-6efd4326bb46 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#step-4 Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/27795f68-01aa-4e92-9fd0-2f17ac61eff2 b/docstore/27795f68-01aa-4e92-9fd0-2f17ac61eff2 new file mode 100644 index 0000000000000000000000000000000000000000..09ede1e0cd752583808e3d0b103ee180dfaf457e --- /dev/null +++ b/docstore/27795f68-01aa-4e92-9fd0-2f17ac61eff2 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media#specific-instructions Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2791da14-1d12-4a88-be92-f946d7936cf7 b/docstore/2791da14-1d12-4a88-be92-f946d7936cf7 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/2791da14-1d12-4a88-be92-f946d7936cf7 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. 
Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/2798dc0d-6f2a-44e1-902e-a8183ef4383b b/docstore/2798dc0d-6f2a-44e1-902e-a8183ef4383b new file mode 100644 index 0000000000000000000000000000000000000000..79af4391feda972140ebff71bc9d49f207aa048c --- /dev/null +++ b/docstore/2798dc0d-6f2a-44e1-902e-a8183ef4383b @@ -0,0 +1 @@ +Long context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Long context Many Gemini models come with large context windows of 1 million or more tokens. Historically, large language models (LLMs) were significantly limited by the amount of text (or tokens) that could be passed to the model at one time. The Gemini long context window unlocks many new use cases and developer paradigms. The code you already use for cases like text generation or multimodal inputs will work without any changes with long context. This document gives you an overview of what you can achieve using models with context windows of 1M and more tokens. The page gives a brief overview of a context window, and explores how developers should think about long context, various real world use cases for long context, and ways to optimize the usage of long context. For the context window sizes of specific models, see the Models page. What is a context window? The basic way you use the Gemini models is by passing information (context) to the model, which will subsequently generate a response. An analogy for the context window is short term memory. There is a limited amount of information that can be stored in someone's short term memory, and the same is true for generative models. You can read more about how models work under the hood in our generative models guide . Getting started with long context Earlier versions of generative models were only able to process 8,000 tokens at a time. Newer models pushed this further by accepting 32,000 or even 128,000 tokens. Gemini is the first model capable of accepting 1 million tokens. 
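To get a feel for how much of that window a given input actually uses, you can count tokens before sending a request. The following is a minimal sketch in Python with the google-genai SDK; the model name and file path are illustrative placeholders rather than details from this page:

```
from google import genai

client = genai.Client()  # reads the API key from the environment

# Load a large document (path is illustrative).
with open("large_document.txt", "r", encoding="utf-8") as f:
    big_text = f.read()

# Count tokens without generating a response.
token_info = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=big_text,
)
print("Input tokens:", token_info.total_tokens)
```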
In practice, 1 million tokens would look like: \ No newline at end of file diff --git a/docstore/27a46ab6-9f80-44c6-9798-c8ed92f7b52a b/docstore/27a46ab6-9f80-44c6-9798-c8ed92f7b52a new file mode 100644 index 0000000000000000000000000000000000000000..10c56dda4e771cbe191acbd7eaea4d6ff44484f5 --- /dev/null +++ b/docstore/27a46ab6-9f80-44c6-9798-c8ed92f7b52a @@ -0,0 +1 @@ +which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", \ No newline at end of file diff --git a/docstore/27ac5050-2029-4ed4-b7e4-b75e6592922f b/docstore/27ac5050-2029-4ed4-b7e4-b75e6592922f new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/27ac5050-2029-4ed4-b7e4-b75e6592922f @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. 
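As a concrete illustration of that advice, a production call can pin the fully qualified stable version rather than a floating alias. A minimal sketch in Python with the google-genai SDK; the version string is the stable example used on this page:

```
from google import genai

client = genai.Client()

# Pin an explicit stable version (for example gemini-2.0-flash-001) instead of
# the floating "gemini-2.0-flash" alias, so behavior does not change underneath you.
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="Summarize the release notes below in two sentences. ...",
)
print(response.text)
```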
To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/27caa893-fa4f-4d73-b2a6-bb1202629dc4 b/docstore/27caa893-fa4f-4d73-b2a6-bb1202629dc4 new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/27caa893-fa4f-4d73-b2a6-bb1202629dc4 @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . 
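To make those elements concrete, here is a hedged sketch that assembles them into a single prompt string and passes it to a Veo generation call in Python; the element values are invented examples, and the model name and polling loop mirror the text-to-video example elsewhere in this guide:

```
import time
from google import genai
from google.genai import types

client = genai.Client()

# Compose the prompt from the elements listed above (values are examples).
elements = [
    "a calico kitten",                      # subject
    "on a sunlit windowsill in a quiet apartment",  # context
    "slowly stretching and curling back to sleep",  # action
    "warm, soft-focus film look",           # style
    "slow dolly shot",                      # camera motion
    "close-up",                             # composition
    "golden hour, warm tones",              # ambiance
]
prompt = ", ".join(elements)

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=prompt,
    config=types.GenerateVideosConfig(aspect_ratio="16:9"),
)
# Video generation is long-running; poll until it finishes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)
```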
More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/27f97d20-c3dc-44f6-a44d-82eaff561dc2 b/docstore/27f97d20-c3dc-44f6-a44d-82eaff561dc2 new file mode 100644 index 0000000000000000000000000000000000000000..34a11694c4ec18846b4474603e1c63c04851d790 --- /dev/null +++ b/docstore/27f97d20-c3dc-44f6-a44d-82eaff561dc2 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'When did the last Brazil vs. Argentina soccer match happen?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Combining multiple tools You can combine multiple tools within the Live API, increasing your application's capabilities even more: Python prompt = """ Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! """ tools = [ { "google_search" : {}}, { "code_execution" : {}}, { "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}, ] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } # ... remaining model call JavaScript const prompt = `Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! ` const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ \ No newline at end of file diff --git a/docstore/27fc54cc-1798-421e-a4c0-3886f4cb216a b/docstore/27fc54cc-1798-421e-a4c0-3886f4cb216a new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/27fc54cc-1798-421e-a4c0-3886f4cb216a @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/283001d4-874c-421a-940c-b0a409a37784 b/docstore/283001d4-874c-421a-940c-b0a409a37784 new file mode 100644 index 0000000000000000000000000000000000000000..461fb5984a10eba311dd4e49e05c6506b1b581d9 --- /dev/null +++ b/docstore/283001d4-874c-421a-940c-b0a409a37784 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/28349b3f-7c3a-4c12-ae4a-09853b2647f2 b/docstore/28349b3f-7c3a-4c12-ae4a-09853b2647f2 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/28349b3f-7c3a-4c12-ae4a-09853b2647f2 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. 
This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/2836684a-6791-4730-9ec3-581a33ed8bfd b/docstore/2836684a-6791-4730-9ec3-581a33ed8bfd new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/2836684a-6791-4730-9ec3-581a33ed8bfd @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/283689bc-46c0-4a46-8491-438004f5fdf4 b/docstore/283689bc-46c0-4a46-8491-438004f5fdf4 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/283689bc-46c0-4a46-8491-438004f5fdf4 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . 
sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/2837bdf4-1598-497e-b48f-57462caaadcc b/docstore/2837bdf4-1598-497e-b48f-57462caaadcc new file mode 100644 index 0000000000000000000000000000000000000000..1a5a15d1a7f161c94f9ce1141985bfe4bd599916 --- /dev/null +++ b/docstore/2837bdf4-1598-497e-b48f-57462caaadcc @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/audio#upload-audio Title: Audio understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/284698bc-2bdf-43bc-b1fd-cd3479a00698 b/docstore/284698bc-2bdf-43bc-b1fd-cd3479a00698 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/284698bc-2bdf-43bc-b1fd-cd3479a00698 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/284a1580-cd50-40c6-a870-e4f68c5f9865 b/docstore/284a1580-cd50-40c6-a870-e4f68c5f9865 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/284a1580-cd50-40c6-a870-e4f68c5f9865 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. 
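Tying back to the earlier discussion of decoding and randomness: temperature is exposed directly in the generation config. A minimal Python sketch with the google-genai SDK; the temperature values and prompt are illustrative:

```
from google import genai
from google.genai import types

client = genai.Client()

prompt = "The dog jumped over the"

# temperature=0 keeps decoding close to deterministic (most likely tokens only).
deterministic = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=0.0),
)

# A higher temperature injects more randomness into token selection.
creative = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=1.5),
)

print(deterministic.text)
print(creative.text)
```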
\ No newline at end of file diff --git a/docstore/284df76c-7d8c-44a1-a02e-f360c1520d49 b/docstore/284df76c-7d8c-44a1-a02e-f360c1520d49 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/284df76c-7d8c-44a1-a02e-f360c1520d49 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/2861fa05-efeb-4bc3-bf65-705d5b83d0d7 b/docstore/2861fa05-efeb-4bc3-bf65-705d5b83d0d7 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/2861fa05-efeb-4bc3-bf65-705d5b83d0d7 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/2873c500-4bff-4cca-bf4d-934ee9a02da6 b/docstore/2873c500-4bff-4cca-bf4d-934ee9a02da6 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/2873c500-4bff-4cca-bf4d-934ee9a02da6 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/287a3670-e915-463f-ac73-9e47771781ae b/docstore/287a3670-e915-463f-ac73-9e47771781ae new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/287a3670-e915-463f-ac73-9e47771781ae @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. 
When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/287d1f0e-b3f6-4b0b-9023-c5f23f923802 b/docstore/287d1f0e-b3f6-4b0b-9023-c5f23f923802 new file mode 100644 index 0000000000000000000000000000000000000000..142d44299675e72008ee076f6f6fed64a9d19949 --- /dev/null +++ b/docstore/287d1f0e-b3f6-4b0b-9023-c5f23f923802 @@ -0,0 +1 @@ +charge $0.50 (text) Output price Free of charge $10.00 (audio) Used to improve our products Yes No Gemini 2.5 Pro Preview TTS Try it in Google AI Studio Our 2.5 Pro text-to-speech audio model optimized for powerful, low-latency speech generation for more natural outputs and easier to steer prompts. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $1.00 (text) Output price Not available $20.00 (audio) Used to improve our products Yes No Gemini 2.0 Flash Try it in Google AI Studio Our most balanced multimodal model with great performance across all tasks, with a 1 million token context window, and built for the era of Agents. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.10 (text / image / video) $0.70 (audio) Output price Free of charge $0.40 Context caching price Free of charge $0.025 / 1,000,000 tokens (text/image/video) $0.175 / 1,000,000 tokens (audio) Context caching (storage) Free of charge, up to 1,000,000 tokens of storage per hour $1.00 / 1,000,000 tokens per hour Image generation pricing Free of charge $0.039 per image* Tuning price Not available Not available Grounding with Google Search Free of charge, up to 500 RPD 1,500 RPD (free), then $35 / 1,000 requests Live API Free of charge Input: $0.35 (text), $2.10 (audio / image [video]) Output: $1.50 (text), $8.50 (audio) Used to improve our products Yes No [*] Image output is priced at $30 per 1,000,000 tokens. Output images up to 1024x1024px consume 1290 tokens and are equivalent to $0.039 per image. Gemini 2.0 Flash-Lite Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. 
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075 Output price Free of charge $0.30 Context caching price Not available Not available Context caching (storage) Not available Not available Tuning price Not available Not available Grounding \ No newline at end of file diff --git a/docstore/28a5a153-4bee-4c94-9e3e-1d2e821f6ea7 b/docstore/28a5a153-4bee-4c94-9e3e-1d2e821f6ea7 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/28a5a153-4bee-4c94-9e3e-1d2e821f6ea7 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/28aa5d8e-b475-45fe-a5b2-b465197709ea b/docstore/28aa5d8e-b475-45fe-a5b2-b465197709ea new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/28aa5d8e-b475-45fe-a5b2-b465197709ea @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/28bb755f-620a-4b03-b85f-6ff3a0d5b965 b/docstore/28bb755f-620a-4b03-b85f-6ff3a0d5b965 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/28bb755f-620a-4b03-b85f-6ff3a0d5b965 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/28ce0335-bb94-420a-91dd-aae76dca15cd b/docstore/28ce0335-bb94-420a-91dd-aae76dca15cd new file mode 100644 index 0000000000000000000000000000000000000000..a3fd9d3225fb67d0660508c87d747294298e3c33 --- /dev/null +++ b/docstore/28ce0335-bb94-420a-91dd-aae76dca15cd @@ -0,0 +1 @@ +{ "uri" : "https://vertexaisearch.cloud.google.com....." , "title" : "uefa.com" }} ], "groundingSupports" : [ { "segment" : { "startIndex" : 0 , "endIndex" : 85 , "text" : "Spain won Euro 2024, defeatin..." }, "groundingChunkIndices" : [ 0 ] }, { "segment" : { "startIndex" : 86 , "endIndex" : 210 , "text" : "This victory marks Spain's..." }, "groundingChunkIndices" : [ 0 , 1 ] } ] } } ] } The Gemini API returns the following information with the groundingMetadata : webSearchQueries : Array of the search queries used. This is useful for debugging and understanding the model's reasoning process. searchEntryPoint : Contains the HTML and CSS to render the required Search Suggestions. Full usage requirements are detailed in the Terms of Service . groundingChunks : Array of objects containing the web sources ( uri and title ). groundingSupports : Array of chunks to connect model response text to the sources in groundingChunks . Each chunk links a text segment (defined by startIndex and endIndex ) to one or more groundingChunkIndices . This is the key to building inline citations. Grounding with Google Search can also be used in combination with the URL context tool to ground responses in both public web data and the specific URLs you provide. Attributing Sources with inline Citations The API returns structured citation data, giving you complete control over how you display sources in your user interface. You can use the groundingSupports and groundingChunks fields to link the model's statements directly to their sources. 
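For example, the following hedged Python sketch walks the supports in reverse order and appends numbered links after each grounded segment; it assumes the response shape described above (segments with end_index, grounding_chunk_indices, and chunks exposing web.uri) and is one possible rendering, not the only one:

```
def insert_citations(response):
    """Return response text with [n](uri) markers appended after each grounded segment."""
    text = response.text
    metadata = response.candidates[0].grounding_metadata
    supports = metadata.grounding_supports or []
    chunks = metadata.grounding_chunks or []

    # Work from the end of the text backwards so earlier insertions
    # don't shift the indices of later segments.
    for support in sorted(supports, key=lambda s: s.segment.end_index, reverse=True):
        links = []
        for i in support.grounding_chunk_indices:
            uri = chunks[i].web.uri
            links.append(f"[{i + 1}]({uri})")
        end = support.segment.end_index
        text = text[:end] + " " + ", ".join(links) + text[end:]
    return text
```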
Here is a common pattern for processing the metadata to create a response with inline, clickable citations. Python def add_citations ( response ): text = response . text supports = response . candidates [ 0 ] . grounding_metadata . grounding_supports chunks = response . candidates [ 0 ] . grounding_metadata . grounding_chunks # Sort supports by end_index in descending order to avoid shifting issues when inserting. sorted_supports = sorted ( supports , key \ No newline at end of file diff --git a/docstore/28ece5ac-36ca-45f0-97fc-c833738f7d6a b/docstore/28ece5ac-36ca-45f0-97fc-c833738f7d6a new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/28ece5ac-36ca-45f0-97fc-c833738f7d6a @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/28f82722-d005-4a4a-a7ce-ccc0859a42d9 b/docstore/28f82722-d005-4a4a-a7ce-ccc0859a42d9 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/28f82722-d005-4a4a-a7ce-ccc0859a42d9 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. 
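A minimal sketch of calling this model for single-speaker speech in Python with the google-genai SDK; the voice name ("Kore"), WAV packaging, and audio parameters are assumptions based on common TTS usage rather than details from this page:

```
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio comes back as raw PCM bytes; wrap it in a WAV container to play it.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)       # 16-bit samples (assumed)
    wf.setframerate(24000)   # 24 kHz output (assumed)
    wf.writeframes(pcm)
```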
Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/291a6d87-0fa9-47c1-9fd4-82231d8ef3de b/docstore/291a6d87-0fa9-47c1-9fd4-82231d8ef3de new file mode 100644 index 0000000000000000000000000000000000000000..1749162b760803024b206e202dc9d02405afd1c5 --- /dev/null +++ b/docstore/291a6d87-0fa9-47c1-9fd4-82231d8ef3de @@ -0,0 +1 @@ +Google employees via an internal governance assessment and review management platform. When data is logged for abuse monitoring, it is used solely for the purpose of policy enforcement and is not used to train or fine-tune any AI/ML models. Working with You on Policy Compliance If your use of Gemini doesn't align with our policies, we may take the following steps: Get in touch: We may reach out to you through email to understand your use case and explore ways to bring your usage into compliance. Temporary usage limits: We may limit your access to the Gemini API. Temporary suspension: We may temporarily pause your access to the Gemini API. Account closure: As a last resort, and for serious violations, we may permanently close your access to the Gemini API and other Google services. Scope These policy guidelines apply to the use of the Gemini API and AI Studio. Inline Preference Voting In Google AI Studio, you might occasionally see a side-by-side comparison of two different responses to your prompt. This is part of our Inline Preference Voting system. You'll be asked to choose which response you prefer. This helps us understand which model outputs users find most helpful. Why are we doing this? We're constantly working to improve our AI models and services. Your feedback through Inline Preference Voting helps us provide, improve, and develop Google products and services and machine learning technologies, including Google's enterprise features, products and services, consistent with the Gemini API Additional Terms of Service and Privacy Policy . What data is included in Feedback? 
To make informed decisions about our models, we collect certain data when you participate in Inline Preference Voting: Prompts and Responses: We record all prompts and responses, including any uploaded content, in the conversation you submitted feedback about. We also record the two response options that you selected from. This helps us understand the context of your preference. Your Vote: We \ No newline at end of file diff --git a/docstore/2924c1cf-e21e-4352-97b6-62c188302c21 b/docstore/2924c1cf-e21e-4352-97b6-62c188302c21 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/2924c1cf-e21e-4352-97b6-62c188302c21 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/2929ee21-e972-41e7-a5fa-7d0bfc845b63 b/docstore/2929ee21-e972-41e7-a5fa-7d0bfc845b63 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2929ee21-e972-41e7-a5fa-7d0bfc845b63 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
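Returning to the segmentation output described earlier: each item's mask is a base64-encoded PNG probability map that has to be resized to its bounding box and binarized. A hedged sketch of that post-processing in Python; the "mask" key name, data-URI prefix handling, and threshold are assumptions consistent with the format description above:

```
import base64
import io

import numpy as np
from PIL import Image

def decode_mask(item, image_width, image_height, threshold=127):
    """Turn one segmentation item into a boolean mask in full-image coordinates."""
    # box_2d is [y0, x0, y1, x1], normalized to 0-1000.
    y0, x0, y1, x1 = item["box_2d"]
    left = int(x0 / 1000 * image_width)
    top = int(y0 / 1000 * image_height)
    right = int(x1 / 1000 * image_width)
    bottom = int(y1 / 1000 * image_height)

    # The mask is a base64-encoded PNG probability map (values 0-255).
    png_b64 = item["mask"].removeprefix("data:image/png;base64,")
    mask_img = Image.open(io.BytesIO(base64.b64decode(png_b64))).convert("L")
    mask_img = mask_img.resize((right - left, bottom - top))

    # Binarize at the chosen confidence threshold (127 is the midpoint).
    box_mask = np.array(mask_img) >= threshold

    full = np.zeros((image_height, image_width), dtype=bool)
    full[top:bottom, left:right] = box_mask
    return full
```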
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2932a147-9ecc-4c1f-8d4a-a94304d0997f b/docstore/2932a147-9ecc-4c1f-8d4a-a94304d0997f new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/2932a147-9ecc-4c1f-8d4a-a94304d0997f @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. 
The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/293eab00-f596-44b0-b9e4-554e87afd6f7 b/docstore/293eab00-f596-44b0-b9e4-554e87afd6f7 new file mode 100644 index 0000000000000000000000000000000000000000..072145835ec7b3d432f62b18d5ac79986f628292 --- /dev/null +++ b/docstore/293eab00-f596-44b0-b9e4-554e87afd6f7 @@ -0,0 +1,3 @@ +URL: https://ai.google.dev/gemini-api/docs/model-tuning Title: Fine-tuning with the Gemini API | Google AI for Developers ================================================== + +Fine-tuning with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Fine-tuning with the Gemini API With the deprecation of Gemini 1.5 Flash-001 in May 2025, we no longer have a model available which supports fine-tuning in the Gemini API, but it is supported in Vertex AI . We plan to bring fine-tuning support back in the future. We would love to hear from you on our developer forum if fine-tuning is important to your use case. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-30 UTC. \ No newline at end of file diff --git a/docstore/29917da0-45a4-42fc-b33e-2fb9a2e76620 b/docstore/29917da0-45a4-42fc-b33e-2fb9a2e76620 new file mode 100644 index 0000000000000000000000000000000000000000..e37f6a6bd50ca27071d672585490917ff3d226f2 --- /dev/null +++ b/docstore/29917da0-45a4-42fc-b33e-2fb9a2e76620 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#segmentation Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/299c94be-76a6-4358-98fd-c6de3f877233 b/docstore/299c94be-76a6-4358-98fd-c6de3f877233 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/299c94be-76a6-4358-98fd-c6de3f877233 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. 
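A minimal sketch of that image-to-video flow in Python with the google-genai SDK; the file path, MIME type, and prompt are placeholders, and constructing a types.Image from local bytes is an assumption consistent with the text-to-video example elsewhere in this guide:

```
import time
from google import genai
from google.genai import types

client = genai.Client()

# Start from an existing still image (path and MIME type are illustrative).
with open("bunny.png", "rb") as f:
    still = types.Image(image_bytes=f.read(), mime_type="image/png")

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Bunny runs away.",
    image=still,
    config=types.GenerateVideosConfig(aspect_ratio="16:9"),
)
# Poll the long-running operation until the video is ready.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)
```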
You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/29a38b94-c80d-4647-a544-913266089522 b/docstore/29a38b94-c80d-4647-a544-913266089522 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/29a38b94-c80d-4647-a544-913266089522 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/29ab9b81-e5c0-4785-9564-b06ce5d41b89 b/docstore/29ab9b81-e5c0-4785-9564-b06ce5d41b89 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/29ab9b81-e5c0-4785-9564-b06ce5d41b89 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/29b9a70a-81e9-4f26-a093-7e00352850b6 b/docstore/29b9a70a-81e9-4f26-a093-7e00352850b6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/29b9a70a-81e9-4f26-a093-7e00352850b6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/29d0ef86-936b-47d5-b573-d2d6d39b516c b/docstore/29d0ef86-936b-47d5-b573-d2d6d39b516c new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/29d0ef86-936b-47d5-b573-d2d6d39b516c @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . 
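To make the tools object concrete, here is a minimal sketch in Python with the google-genai SDK: a single hypothetical get_current_temperature declaration written in the supported OpenAPI-style JSON schema subset, wrapped in a Tool and passed to the model via GenerateContentConfig. The function name, parameters, and prompt are illustrative, not part of the original documentation.

Python
from google import genai
from google.genai import types

client = genai.Client()

# A hypothetical function declaration, expressed as a JSON-style dict
# using the supported subset of the OpenAPI schema format.
get_current_temperature = {
    "name": "get_current_temperature",
    "description": "Gets the current temperature for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city name, e.g. San Francisco",
            },
        },
        "required": ["location"],
    },
}

# Wrap the declaration in a tools object and send it with the request.
tools = types.Tool(function_declarations=[get_current_temperature])
config = types.GenerateContentConfig(tools=[tools])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in London?",
    config=config,
)

# If the model chose to call the function, the suggestion appears here.
function_call = response.candidates[0].content.parts[0].function_call
print(function_call.name)
print(function_call.args)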
You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/29ff6d74-7f33-4c38-8ba8-f878bcac1a61 b/docstore/29ff6d74-7f33-4c38-8ba8-f878bcac1a61 new file mode 100644 index 0000000000000000000000000000000000000000..ff0e917f3633351471582782b39463f2a8e6c8ed --- /dev/null +++ b/docstore/29ff6d74-7f33-4c38-8ba8-f878bcac1a61 @@ -0,0 +1 @@ +Gemini Developer API v.s. Vertex AI | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini Developer API v.s. Vertex AI When developing generative AI solutions with Gemini, Google offers two API products: the Gemini Developer API and the Vertex AI Gemini API . The Gemini Developer API provides the fastest path to build, productionize, and scale Gemini powered applications. Most developers should use the Gemini Developer API unless there is a need for specific enterprise controls. Vertex AI offers a comprehensive ecosystem of enterprise ready features and services for building and deploying generative AI applications backed by the Google Cloud Platform. We've recently simplified migrating between these services. Both the Gemini Developer API and the Vertex AI Gemini API are now accessible through the unified Google Gen AI SDK . Code comparison This page has side-by-side code comparisons between Gemini Developer API and Vertex AI quickstarts for text generation. Python You can access both the Gemini Developer API and Vertex AI services through the google-genai library. See the libraries page for instructions on how to install google-genai . Gemini Developer API from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) Vertex AI Gemini API from google import genai client = genai . Client ( vertexai = True , project = 'your-project-id' , location = 'us-central1' ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) \ No newline at end of file diff --git a/docstore/2a1788e8-4fb5-48b2-ad4e-9b1559bb7679 b/docstore/2a1788e8-4fb5-48b2-ad4e-9b1559bb7679 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/2a1788e8-4fb5-48b2-ad4e-9b1559bb7679 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. 
You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/2a269cd3-effc-414b-b45b-1348466b14aa b/docstore/2a269cd3-effc-414b-b45b-1348466b14aa new file mode 100644 index 0000000000000000000000000000000000000000..2426e4316b986fa0eda84ec610a81c084b69e3a5 --- /dev/null +++ b/docstore/2a269cd3-effc-414b-b45b-1348466b14aa @@ -0,0 +1 @@ += genai . embed_content ( model = 'models/text-embedding-004' , content = 'Hello world' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "text-embedding-004" , }); const result = await model . embedContent ( "Hello world!" ); console . log ( result . embedding ); After Python from google import genai client = genai . Client () response = client . models . embed_content ( model = 'text-embedding-004' , contents = 'Hello world' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const text = "Hello World!" ; const result = await ai . models . embedContent ({ model : "text-embedding-004" , contents : text , config : { outputDimensionality : 10 }, }); console . log ( result . embeddings ); Tune a Model Create and use a tuned model. The new SDK simplifies tuning with client.tunings.tune , which launches the tuning job and polls until the job is complete. Before Python import google.generativeai as genai import random # create tuning model train_data = {} for i in range ( 1 , 6 ): key = f 'input { i } ' value = f 'output { i } ' train_data [ key ] = value name = f 'generate-num- { random . randint ( 0 , 10000 ) } ' operation = genai . create_tuned_model ( source_model = 'models/gemini-1.5-flash-001-tuning' , training_data = train_data , id = name , epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , ) # wait for tuning complete tuningProgress = operation . result () # generate content with the tuned model model = genai . GenerativeModel ( model_name = f 'tunedModels/ { name } ' ) response = model . generate_content ( '55' ) After Python from google import genai from google.genai import types client = genai . Client () # Check which models are available for tuning. for m in client . models . list (): for action in m . 
supported_actions : if action == \ No newline at end of file diff --git a/docstore/2a3766c7-885e-493f-9512-fdf02b54a7f4 b/docstore/2a3766c7-885e-493f-9512-fdf02b54a7f4 new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/2a3766c7-885e-493f-9512-fdf02b54a7f4 @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/2a4fc27e-206c-4274-88fd-8c30b79aefc3 b/docstore/2a4fc27e-206c-4274-88fd-8c30b79aefc3 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/2a4fc27e-206c-4274-88fd-8c30b79aefc3 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/2ab0d37a-2644-4ca9-80a4-2f290e76b360 b/docstore/2ab0d37a-2644-4ca9-80a4-2f290e76b360 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/2ab0d37a-2644-4ca9-80a4-2f290e76b360 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/2ab44905-34de-4315-bd6d-803a724c13c4 b/docstore/2ab44905-34de-4315-bd6d-803a724c13c4 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/2ab44905-34de-4315-bd6d-803a724c13c4 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
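For the Imagen entries above, the following is a minimal sketch of text-to-image generation with the google-genai Python SDK. It assumes the imagen-3.0-generate-002 model code from this listing and the generate_images / GenerateImagesConfig surface; the prompt (taken from the wide-angle photography examples elsewhere in these pages) and output file names are illustrative.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A photo of the moon, astro photography, wide angle 10mm",
    config=types.GenerateImagesConfig(
        number_of_images=2,   # Imagen 3 can return up to 4 images per request
        aspect_ratio="16:9",
    ),
)

# Save each generated image's raw bytes to disk.
for n, generated in enumerate(response.generated_images):
    with open(f"moon_{n}.png", "wb") as f:
        f.write(generated.image.image_bytes)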
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/2af471aa-e9e4-4a79-b81a-b8f6dd2b7b00 b/docstore/2af471aa-e9e4-4a79-b81a-b8f6dd2b7b00 new file mode 100644 index 0000000000000000000000000000000000000000..d389eb08e8733f8ecbdee6426fdf0ee2f2a1fd57 --- /dev/null +++ b/docstore/2af471aa-e9e4-4a79-b81a-b8f6dd2b7b00 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation#choose-a-model Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2b15c77f-acc5-48c2-847a-01797e4e7f1f b/docstore/2b15c77f-acc5-48c2-847a-01797e4e7f1f new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/2b15c77f-acc5-48c2-847a-01797e4e7f1f @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
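Since the embedding entry above describes text-embedding-004 as a way to measure the relatedness of strings, here is a minimal sketch using the google-genai Python SDK. It assumes the embed_content call shown later in these pages and that each returned embedding exposes its vector through a values field; the cosine-similarity helper and the example strings are illustrative.

Python
import math
from google import genai

client = genai.Client()

def cosine_similarity(a, b):
    # Standard cosine similarity between two equal-length vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

# Embed two strings in one request (768-dimensional vectors for this model).
response = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How does photosynthesis work?"],
)

vec_a = response.embeddings[0].values
vec_b = response.embeddings[1].values
print(f"Cosine similarity: {cosine_similarity(vec_a, vec_b):.3f}")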
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/2b5c35d5-8948-4114-86a6-28f0bfd0ce50 b/docstore/2b5c35d5-8948-4114-86a6-28f0bfd0ce50 new file mode 100644 index 0000000000000000000000000000000000000000..68dfcf53eb693dba8358b7fdf6b0010fadcbc966 --- /dev/null +++ b/docstore/2b5c35d5-8948-4114-86a6-28f0bfd0ce50 @@ -0,0 +1 @@ +. files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), ]), }); console . log ( countTokensResponse . totalTokens ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } tokens , _ := client . Models . CountTokens ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Printf ( "File %s is %d tokens\n" , localAudioPath , tokens . TotalTokens ) } Supported audio formats Gemini supports the following audio format MIME types: WAV - audio/wav MP3 - audio/mp3 AIFF - audio/aiff AAC - audio/aac OGG Vorbis - audio/ogg FLAC - audio/flac Technical details about audio Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens. Gemini can "understand" non-speech components, such as birdsong or sirens. The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the number of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours. Gemini downsamples audio files to a 16 Kbps data resolution. If the audio source contains multiple channels, Gemini combines those channels into a single channel. What's next This guide shows how to generate text in response to audio data. 
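The audio token accounting described above (32 tokens per second of audio, so 1,920 tokens per minute, with a 9.5-hour combined limit per prompt) can be sanity-checked locally with a few lines of plain Python; the durations used here are illustrative.

Python
# Rough client-side estimate of audio token usage, based on the
# documented rate of 32 tokens per second of audio.
TOKENS_PER_SECOND = 32
MAX_AUDIO_SECONDS = 9.5 * 60 * 60  # 9.5-hour combined limit per prompt

def estimate_audio_tokens(duration_seconds: float) -> int:
    return int(duration_seconds * TOKENS_PER_SECOND)

print(estimate_audio_tokens(60))        # 1920 tokens for one minute of audio
print(estimate_audio_tokens(45 * 60))   # a 45-minute recording: 86400 tokens
print(45 * 60 <= MAX_AUDIO_SECONDS)     # True: well within the 9.5-hour limit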
To learn more, see the following resources: File prompting strategies : \ No newline at end of file diff --git a/docstore/2b621dc1-bd96-4315-a6ed-711574473383 b/docstore/2b621dc1-bd96-4315-a6ed-711574473383 new file mode 100644 index 0000000000000000000000000000000000000000..059c4a1b599ce3070a58d48cdf4339126f3d022d --- /dev/null +++ b/docstore/2b621dc1-bd96-4315-a6ed-711574473383 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate Title: Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2beee2a9-e9b5-4d5a-a87a-d1bc0eb048ab b/docstore/2beee2a9-e9b5-4d5a-a87a-d1bc0eb048ab new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/2beee2a9-e9b5-4d5a-a87a-d1bc0eb048ab @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. 
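The few-shot pattern described above can be reproduced with a short google-genai Python call. This is a minimal sketch assuming the gemini-2.5-flash model named in the example response; the example count is kept small, in line with the overfitting caution just mentioned.

Python
from google import genai

client = genai.Client()

# Two short examples establish the preferred concise-answer format;
# the new question follows the same Question/Explanation/Answer layout.
few_shot_prompt = """Below are some examples showing a question, explanation, and answer format:

Question: Why is the sky blue?
Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths.
Explanation2: Due to Rayleigh scattering effect.
Answer: Explanation2

Question: What is the cause of earthquakes?
Explanation1: Sudden release of energy in the Earth's crust.
Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, releasing energy as seismic waves.
Answer: Explanation1

Now, answer the following question given the example formats above:

Question: How is snow formed?
Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall.
Explanation2: Water vapor freezes into ice crystals forming snow.
Answer:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)  # Expected to prefer the concise explanation, per the examples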
Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/2bf68a93-1e9d-4938-82e1-86c536ce7cae b/docstore/2bf68a93-1e9d-4938-82e1-86c536ce7cae new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/2bf68a93-1e9d-4938-82e1-86c536ce7cae @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/2c163174-7820-4382-aa85-728378100b4e b/docstore/2c163174-7820-4382-aa85-728378100b4e new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/2c163174-7820-4382-aa85-728378100b4e @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/2c57112f-1a22-4568-91b5-b612498f0a61 b/docstore/2c57112f-1a22-4568-91b5-b612498f0a61 new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/2c57112f-1a22-4568-91b5-b612498f0a61 @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/2c58f681-b64b-4c6c-88c5-0aab4d0eb3ba b/docstore/2c58f681-b64b-4c6c-88c5-0aab4d0eb3ba new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/2c58f681-b64b-4c6c-88c5-0aab4d0eb3ba @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . 
batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/2c69c961-8f75-4542-b4a6-817fabff394d b/docstore/2c69c961-8f75-4542-b4a6-817fabff394d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2c69c961-8f75-4542-b4a6-817fabff394d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
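Before cancelling or deleting a batch job as shown earlier in this chunk, it is usually worth checking its current state. The following is a minimal sketch assuming the client.batches surface used above also exposes a get() lookup, and that the job's state carries the same JOB_STATE_* values seen in the REST responses; the job name is a placeholder.

Python
import time
from google import genai

client = genai.Client()

job_name = "batches/123456"  # Your batch job name (placeholder)

# Poll the job until it reaches a terminal state.
terminal_states = {"JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}
while True:
    batch_job = client.batches.get(name=job_name)
    state = batch_job.state.name
    if state in terminal_states:
        break
    time.sleep(30)

if state == "JOB_STATE_FAILED":
    print(batch_job.error)
else:
    print(f"Batch finished with state: {state}")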
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2c706611-c670-4817-8694-8ea49990d8b0 b/docstore/2c706611-c670-4817-8694-8ea49990d8b0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/2c706611-c670-4817-8694-8ea49990d8b0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/2c7f6a7a-30b5-4000-9f5c-a0fb033e154f b/docstore/2c7f6a7a-30b5-4000-9f5c-a0fb033e154f new file mode 100644 index 0000000000000000000000000000000000000000..83f33a76d0463ea555ab2ce44cc200efcb231e58 --- /dev/null +++ b/docstore/2c7f6a7a-30b5-4000-9f5c-a0fb033e154f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#object-detection Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2c808de4-e4f1-4630-9abb-a427457ba5c0 b/docstore/2c808de4-e4f1-4630-9abb-a427457ba5c0 new file mode 100644 index 0000000000000000000000000000000000000000..651124da3927c31f504e3c50b4f98f32fbef29df --- /dev/null +++ b/docstore/2c808de4-e4f1-4630-9abb-a427457ba5c0 @@ -0,0 +1 @@ +chat . send_message ( message = 'What happened after that?' ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const chat = ai . chats . create ({ model : "gemini-2.0-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } chat , err := client . Chats . Create ( ctx , "gemini-2.0-flash" , nil , nil ) if err != nil { log . Fatal ( err ) } result , err := chat . SendMessage ( ctx , genai . Part { Text : "Hello, I have 2 dogs in my house." }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result result , err = chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Function calling Before Python import google.generativeai as genai from enum import Enum def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) response = model . generate_content ( "What is the weather in San Francisco?" ) function_call = response . candidates [ 0 ] . parts [ 0 ] . function_call After Python In the new SDK, automatic function calling is the \ No newline at end of file diff --git a/docstore/2c80d5cb-e725-4899-a5c0-ae1d333a33b6 b/docstore/2c80d5cb-e725-4899-a5c0-ae1d333a33b6 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/2c80d5cb-e725-4899-a5c0-ae1d333a33b6 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/2c852356-8c70-422a-ba11-79485930757d b/docstore/2c852356-8c70-422a-ba11-79485930757d new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/2c852356-8c70-422a-ba11-79485930757d @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
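For the conversational image-generation model this entry describes, here is a minimal Python sketch assuming the same TEXT/IMAGE response-modalities pattern shown in the JavaScript and Go snippets elsewhere in these pages, with Pillow used only to save the returned bytes; the prompt and output path are illustrative.

Python
from io import BytesIO
from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Can you add a llama next to this scene of a sunny meadow?",
    config=types.GenerateContentConfig(
        # Include IMAGE so the model may return an image part.
        response_modalities=["TEXT", "IMAGE"],
    ),
)

# The response can interleave text parts and inline image data.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        Image.open(BytesIO(part.inline_data.data)).save("gemini-native-image.png")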
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/2c9315d3-5941-414a-8e9e-bbd88bc8caf4 b/docstore/2c9315d3-5941-414a-8e9e-bbd88bc8caf4 new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/2c9315d3-5941-414a-8e9e-bbd88bc8caf4 @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/2cc9cf7a-acb0-46a4-b3d5-f1609b839508 b/docstore/2cc9cf7a-acb0-46a4-b3d5-f1609b839508 new file mode 100644 index 0000000000000000000000000000000000000000..3c4c34cae86a8c65d90dc5a5c6bf6faef058f793 --- /dev/null +++ b/docstore/2cc9cf7a-acb0-46a4-b3d5-f1609b839508 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-preview-image-generation Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2cf012c1-f8eb-4e3c-a21f-6c1f6a16d735 b/docstore/2cf012c1-f8eb-4e3c-a21f-6c1f6a16d735 new file mode 100644 index 0000000000000000000000000000000000000000..5f8a5e922d24af531eff4f89e4f99a5736b0820b --- /dev/null +++ b/docstore/2cf012c1-f8eb-4e3c-a21f-6c1f6a16d735 @@ -0,0 +1 @@ +(`totalTokenCount`). console . log ( generateResult . response . 
usageMetadata ); // candidatesTokenCount and totalTokenCount depend on response, may vary // { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 } After Python from google import genai client = genai . Client () response = client . models . count_tokens ( model = 'gemini-2.0-flash' , contents = 'The quick brown fox jumps over the lazy dog.' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const prompt = "The quick brown fox jumps over the lazy dog." ; const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( countTokensResponse . totalTokens ); const generateResponse = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( generateResponse . usageMetadata ); Generate images Generate images: Before Python #pip install https://github.com/google-gemini/generative-ai-python@imagen import google.generativeai as genai imagen = genai . ImageGenerationModel ( "imagen-3.0-generate-001" ) gen_images = imagen . generate_images ( prompt = "Robot holding a red skateboard" , number_of_images = 1 , safety_filter_level = "block_low_and_above" , person_generation = "allow_adult" , aspect_ratio = "3:4" , ) After Python from google import genai client = genai . Client () gen_images = client . models . generate_images ( model = 'imagen-3.0-generate-001' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 1 , safety_filter_level = "BLOCK_LOW_AND_ABOVE" , person_generation = "ALLOW_ADULT" , aspect_ratio = "3:4" , ) ) for n , image in enumerate ( gen_images . generated_images ): pathlib . Path ( f ' { n } .png' ) . write_bytes ( image . image . image_bytes ) Embed content Generate content embeddings. Before Python import google.generativeai as genai response \ No newline at end of file diff --git a/docstore/2d024b2b-fc38-4b00-a254-959ff4d2f4e3 b/docstore/2d024b2b-fc38-4b00-a254-959ff4d2f4e3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/2d024b2b-fc38-4b00-a254-959ff4d2f4e3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/2d24fae5-52d3-4733-85a4-b34b1a84c764 b/docstore/2d24fae5-52d3-4733-85a4-b34b1a84c764 new file mode 100644 index 0000000000000000000000000000000000000000..142d44299675e72008ee076f6f6fed64a9d19949 --- /dev/null +++ b/docstore/2d24fae5-52d3-4733-85a4-b34b1a84c764 @@ -0,0 +1 @@ +charge $0.50 (text) Output price Free of charge $10.00 (audio) Used to improve our products Yes No Gemini 2.5 Pro Preview TTS Try it in Google AI Studio Our 2.5 Pro text-to-speech audio model optimized for powerful, low-latency speech generation for more natural outputs and easier to steer prompts. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $1.00 (text) Output price Not available $20.00 (audio) Used to improve our products Yes No Gemini 2.0 Flash Try it in Google AI Studio Our most balanced multimodal model with great performance across all tasks, with a 1 million token context window, and built for the era of Agents. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.10 (text / image / video) $0.70 (audio) Output price Free of charge $0.40 Context caching price Free of charge $0.025 / 1,000,000 tokens (text/image/video) $0.175 / 1,000,000 tokens (audio) Context caching (storage) Free of charge, up to 1,000,000 tokens of storage per hour $1.00 / 1,000,000 tokens per hour Image generation pricing Free of charge $0.039 per image* Tuning price Not available Not available Grounding with Google Search Free of charge, up to 500 RPD 1,500 RPD (free), then $35 / 1,000 requests Live API Free of charge Input: $0.35 (text), $2.10 (audio / image [video]) Output: $1.50 (text), $8.50 (audio) Used to improve our products Yes No [*] Image output is priced at $30 per 1,000,000 tokens. Output images up to 1024x1024px consume 1290 tokens and are equivalent to $0.039 per image. Gemini 2.0 Flash-Lite Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. 
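The per-image figure quoted above for Gemini 2.0 Flash image output follows directly from the per-token rate. Here is a minimal sketch of that arithmetic in Python, using only the numbers listed on this page; treat the rates as illustrative, since published pricing can change.
Python
# Illustrative arithmetic only; both numbers are copied from the pricing text above and may change.
IMAGE_OUTPUT_USD_PER_MILLION_TOKENS = 30.00  # "$30 per 1,000,000 tokens" for image output
TOKENS_PER_IMAGE_UP_TO_1024PX = 1290         # tokens consumed by one image up to 1024x1024px
cost_per_image = TOKENS_PER_IMAGE_UP_TO_1024PX * IMAGE_OUTPUT_USD_PER_MILLION_TOKENS / 1_000_000
print(f"${cost_per_image:.4f} per image")    # ~$0.0387, matching the ~$0.039 quoted above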
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075 Output price Free of charge $0.30 Context caching price Not available Not available Context caching (storage) Not available Not available Tuning price Not available Not available Grounding \ No newline at end of file diff --git a/docstore/2d25fff4-2181-4220-9c58-afdd57a7a0e7 b/docstore/2d25fff4-2181-4220-9c58-afdd57a7a0e7 new file mode 100644 index 0000000000000000000000000000000000000000..2871a001de204da4c0d017b9f1db66ca97536194 --- /dev/null +++ b/docstore/2d25fff4-2181-4220-9c58-afdd57a7a0e7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#embedding Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2d38ed2a-6f38-47b9-8b20-8665fd6a7125 b/docstore/2d38ed2a-6f38-47b9-8b20-8665fd6a7125 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/2d38ed2a-6f38-47b9-8b20-8665fd6a7125 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/2d52a08e-4635-471b-b562-79c6ae993927 b/docstore/2d52a08e-4635-471b-b562-79c6ae993927 new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/2d52a08e-4635-471b-b562-79c6ae993927 @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/2d6dc25b-6b13-4e28-801e-bd47b8299830 b/docstore/2d6dc25b-6b13-4e28-801e-bd47b8299830 new file mode 100644 index 0000000000000000000000000000000000000000..7ea9f8bcfd3306547cf34f86a441b3041d14b40c --- /dev/null +++ b/docstore/2d6dc25b-6b13-4e28-801e-bd47b8299830 @@ -0,0 +1 @@ +Safety guidance | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Safety guidance Generative artificial intelligence models are powerful tools, but they are not without their limitations. Their versatility and applicability can sometimes lead to unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing, and rigorous manual evaluation are essential to limit the risk of harm from such outputs. The models provided by the Gemini API can be used for a wide variety of generative AI and natural language processing (NLP) applications. Use of these functions is only available through the Gemini API or the Google AI Studio web app. Your use of Gemini API is also subject to the Generative AI Prohibited Use Policy and the Gemini API terms of service . Part of what makes large language models (LLMs) so useful is that they're creative tools that can address many different language tasks. 
Unfortunately, this also means that large language models can generate output that you don't expect, including text that's offensive, insensitive, or factually incorrect. What's more, the incredible versatility of these models is also what makes it difficult to predict exactly what kinds of undesirable output they might produce. While the Gemini API has been designed with Google's AI principles in mind, the onus is on developers to apply these models responsibly. To aid developers in creating safe, responsible applications, the Gemini API has some built-in content filtering as well as adjustable safety settings across 4 dimensions of harm. Refer to the safety settings guide to learn more. This document is meant to introduce you \ No newline at end of file diff --git a/docstore/2d6ecb7b-e84c-4114-bcc1-6f1708be873d b/docstore/2d6ecb7b-e84c-4114-bcc1-6f1708be873d new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/2d6ecb7b-e84c-4114-bcc1-6f1708be873d @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/2d8e987a-ec5c-4743-b9c4-2ced5ed1ebe4 b/docstore/2d8e987a-ec5c-4743-b9c4-2ced5ed1ebe4 new file mode 100644 index 0000000000000000000000000000000000000000..3d0efcbd852506bcdcffe96143ffad9326aef9eb --- /dev/null +++ b/docstore/2d8e987a-ec5c-4743-b9c4-2ced5ed1ebe4 @@ -0,0 +1 @@ +from google import genai client = genai . Client () response = client . models . 
generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) print ( response . text ) print ( response . model_dump_json ( exclude_none = True , indent = 4 )) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story in 300 words." , }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me a story in 300 words." ), nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Image Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Tell me a story based on this image' , Image . open ( image_path ) ]) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); function fileToGenerativePart ( path , mimeType ) { return { inlineData : { data : Buffer . from ( fs . readFileSync ( path )). toString ( "base64" ), mimeType , }, }; } const prompt = "Tell me a story based on this image" ; const imagePart = fileToGenerativePart ( `path/to/organ.jpg` , "image/jpeg" , ); const result = await model . generateContent ([ prompt , imagePart ]); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( \ No newline at end of file diff --git a/docstore/2da7ac9b-58a9-48a4-8010-00118e397ad8 b/docstore/2da7ac9b-58a9-48a4-8010-00118e397ad8 new file mode 100644 index 0000000000000000000000000000000000000000..0304d79cd54dd8b1472f98b3520e7fff5493e63f --- /dev/null +++ b/docstore/2da7ac9b-58a9-48a4-8010-00118e397ad8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#signatures Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2dacd6ba-f84c-4cbc-9d4b-f0958184a49d b/docstore/2dacd6ba-f84c-4cbc-9d4b-f0958184a49d new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/2dacd6ba-f84c-4cbc-9d4b-f0958184a49d @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." 
, "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/2dae6424-b6da-4aa7-b7fa-07212b2053ab b/docstore/2dae6424-b6da-4aa7-b7fa-07212b2053ab new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/2dae6424-b6da-4aa7-b7fa-07212b2053ab @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/2dc3bd01-7f80-4c97-8392-6d78af735172 b/docstore/2dc3bd01-7f80-4c97-8392-6d78af735172 new file mode 100644 index 0000000000000000000000000000000000000000..e65651f01d027d562514063ab972a5a18175d6fb --- /dev/null +++ b/docstore/2dc3bd01-7f80-4c97-8392-6d78af735172 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode#retrieve-batch-results Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2dc4ab0f-3c0b-4238-a6fd-24002c1ead40 b/docstore/2dc4ab0f-3c0b-4238-a6fd-24002c1ead40 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/2dc4ab0f-3c0b-4238-a6fd-24002c1ead40 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. 
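A quick way to see the effect of the step-by-step technique described in this section is to send the same question with and without the extra instruction. The sketch below uses the google-genai client and the formula from this guide's example; the guide's own prompt supplies the formula as an image, which is omitted here, and the model name is simply the one used in nearby examples.
Python
# Minimal sketch of the "think step by step" technique, assuming the google-genai client
# shown elsewhere in these docs. The guide's example supplies the formula as an image;
# here it is inlined as text to keep the sketch self-contained.
from google import genai

client = genai.Client()
sequence = "The sequence is defined by b(n) = b(n-1) * (-3), with b(1) = 15."

for suffix in ("", " Think step by step."):
    response = client.models.generate_content(
        model="gemini-2.5-flash",  # model name taken from nearby examples
        contents=f"{sequence} What is the 4th term in the sequence?{suffix}",
    )
    print(response.text)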
Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/2de9798f-3201-42be-9254-c2a705a2e6a5 b/docstore/2de9798f-3201-42be-9254-c2a705a2e6a5 new file mode 100644 index 0000000000000000000000000000000000000000..989a38b805ed3662f352ccf72b45824dd12e3417 --- /dev/null +++ b/docstore/2de9798f-3201-42be-9254-c2a705a2e6a5 @@ -0,0 +1 @@ +Tool use with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Tool use with Live API Tool use allows Live API to go beyond just conversation by enabling it to perform actions in the real-world and pull in external context while maintaining a real time connection. You can define tools such as Function calling , Code execution , and Google Search with the Live API. Overview of supported tools Here's a brief overview of the available tools for each model: Tool Cascaded models gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Search Yes Yes Yes Function calling Yes Yes No Code execution Yes No No Url context Yes No No Function calling Live API supports function calling, just like regular content generation requests. Function calling lets the Live API interact with external data and programs, greatly increasing what your applications can accomplish. You can define function declarations as part of the session configuration. After receiving tool calls, the client should respond with a list of FunctionResponse objects using the session.send_tool_response method. See the Function calling tutorial to learn more. Note: Unlike the generateContent API, the Live API doesn't support automatic tool response handling. You must handle tool responses manually in your client code. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" # Simple function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" } turn_off_the_lights = { "name" : \ No newline at end of file diff --git a/docstore/2dfb84a0-06d3-4e28-b171-369995448223 b/docstore/2dfb84a0-06d3-4e28-b171-369995448223 new file mode 100644 index 0000000000000000000000000000000000000000..8c04b7a4dc0b855882a3249b591f0ae2ee5cb8ef --- /dev/null +++ b/docstore/2dfb84a0-06d3-4e28-b171-369995448223 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/pricing#veo-2 Title: Gemini Developer API Pricing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2e0d17d7-a2e3-4dfb-9628-f95408f9ad83 b/docstore/2e0d17d7-a2e3-4dfb-9628-f95408f9ad83 new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/2e0d17d7-a2e3-4dfb-9628-f95408f9ad83 @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . 
Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/2e1df632-0f79-49e5-8bc5-c141b07d4e73 b/docstore/2e1df632-0f79-49e5-8bc5-c141b07d4e73 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/2e1df632-0f79-49e5-8bc5-c141b07d4e73 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function.
Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/2e38597f-6231-4e96-b9f3-6dcb60dc91af b/docstore/2e38597f-6231-4e96-b9f3-6dcb60dc91af new file mode 100644 index 0000000000000000000000000000000000000000..6b49de6482bf764efc991923a68119bf33fab745 --- /dev/null +++ b/docstore/2e38597f-6231-4e96-b9f3-6dcb60dc91af @@ -0,0 +1 @@ +sessions using the token from this request ( newSessionExpireTime ), and 30 minutes to send messages over that connection ( expireTime ). Python import datetime now = datetime . datetime . now ( tz = datetime . timezone . utc ) client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , # The ephemeral token can only be used to start a single session 'expire_time' : now + datetime . timedelta ( minutes = 30 ), # Default is 30 minutes in the future # 'expire_time': '2025-05-17T00:00:00Z', # Accepts isoformat. 'new_session_expire_time' : now + datetime . timedelta ( minutes = 1 ), # Default 1 minute in the future 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token : AuthToken = await client . authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime // Default is 30 mins newSessionExpireTime : new Date ( Date . now () + ( 1 * 60 * 1000 )), // Default 1 minute in the future httpOptions : { apiVersion : 'v1alpha' }, }, }); For expireTime value constraints, defaults, and other field specs, see the API reference . Within the expireTime timeframe, you'll need sessionResumption to reconnect the call every 10 minutes (this can be done with the same token even if uses: 1 ). It's also possible to lock an ephemeral token to a set of configurations. This might be useful to further improve security of your application and keep your system instructions on the server side. Python client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , 'live_connect_constraints' : { 'model' : 'gemini-2.0-flash-live-001' , 'config' : { 'session_resumption' \ No newline at end of file diff --git a/docstore/2e4525f7-ebec-4881-b85e-640799f85d50 b/docstore/2e4525f7-ebec-4881-b85e-640799f85d50 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/2e4525f7-ebec-4881-b85e-640799f85d50 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. 
Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/2e46d0cc-ebd4-4fb7-9b97-f6de608026ee b/docstore/2e46d0cc-ebd4-4fb7-9b97-f6de608026ee new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2e46d0cc-ebd4-4fb7-9b97-f6de608026ee @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2e4ca181-7e0b-4505-a267-1a7798e4594e b/docstore/2e4ca181-7e0b-4505-a267-1a7798e4594e new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/2e4ca181-7e0b-4505-a267-1a7798e4594e @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/2e541ff7-2812-42a3-bb0e-bb8ec6420ebd b/docstore/2e541ff7-2812-42a3-bb0e-bb8ec6420ebd new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/2e541ff7-2812-42a3-bb0e-bb8ec6420ebd @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. 
Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/2e584681-6787-4b2e-8435-6803fb9819c3 b/docstore/2e584681-6787-4b2e-8435-6803fb9819c3 new file mode 100644 index 0000000000000000000000000000000000000000..5db8bfd80021d23905dc72cb076a591abebf4c74 --- /dev/null +++ b/docstore/2e584681-6787-4b2e-8435-6803fb9819c3 @@ -0,0 +1 @@ +Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . \ No newline at end of file diff --git a/docstore/2e651687-7a54-4066-9db1-1c6d3a39c5f2 b/docstore/2e651687-7a54-4066-9db1-1c6d3a39c5f2 new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/2e651687-7a54-4066-9db1-1c6d3a39c5f2 @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. 
You'll need to choose a \ No newline at end of file diff --git a/docstore/2e654670-9830-42a9-a0ed-bbd05468c783 b/docstore/2e654670-9830-42a9-a0ed-bbd05468c783 new file mode 100644 index 0000000000000000000000000000000000000000..e76da6c207eae1da557e785101f2cf3e4287370a --- /dev/null +++ b/docstore/2e654670-9830-42a9-a0ed-bbd05468c783 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding#main-content Title: Video understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2e736f1b-a046-4a8c-8f20-135722585da5 b/docstore/2e736f1b-a046-4a8c-8f20-135722585da5 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/2e736f1b-a046-4a8c-8f20-135722585da5 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/2e7d60ea-8137-43f5-8d6f-e750f5e13375 b/docstore/2e7d60ea-8137-43f5-8d6f-e750f5e13375 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/2e7d60ea-8137-43f5-8d6f-e750f5e13375 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/2e877f3b-2bdd-45f9-83f4-61e4ae6331e8 b/docstore/2e877f3b-2bdd-45f9-83f4-61e4ae6331e8 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/2e877f3b-2bdd-45f9-83f4-61e4ae6331e8 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/2eb8ee29-51f9-41f5-83be-f560a9bdb9de b/docstore/2eb8ee29-51f9-41f5-83be-f560a9bdb9de new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/2eb8ee29-51f9-41f5-83be-f560a9bdb9de @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/2ebe7bc4-0123-40dd-ba1b-36558b90ac38 b/docstore/2ebe7bc4-0123-40dd-ba1b-36558b90ac38 new file mode 100644 index 0000000000000000000000000000000000000000..8f7945b0bd22308457df570e5259c405fbd173ad --- /dev/null +++ b/docstore/2ebe7bc4-0123-40dd-ba1b-36558b90ac38 @@ -0,0 +1 @@ +and 100 is full brightness' , }, color_temp : { type : Type . 
STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user \ No newline at end of file diff --git a/docstore/2ebfcebe-5333-4365-9507-c05076e2cb13 b/docstore/2ebfcebe-5333-4365-9507-c05076e2cb13 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/2ebfcebe-5333-4365-9507-c05076e2cb13 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: model-generation-variation . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: model-generation-variation-version . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: model-generation-variation-version . For example, gemini-2.5-pro-preview-06-05 .
Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/2ec722a5-9429-4368-b14e-65fa73ae4386 b/docstore/2ec722a5-9429-4368-b14e-65fa73ae4386 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/2ec722a5-9429-4368-b14e-65fa73ae4386 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/2ece808b-24a0-4058-9dc8-977e4fdccdcf b/docstore/2ece808b-24a0-4058-9dc8-977e4fdccdcf new file mode 100644 index 0000000000000000000000000000000000000000..b9a0fd4b25842077eb6b6b7137ead6b05cace053 --- /dev/null +++ b/docstore/2ece808b-24a0-4058-9dc8-977e4fdccdcf @@ -0,0 +1 @@ +record which response you preferred. This is the core of the feedback we're collecting. 
Usage Details: This includes information about which model generated the response and other technical and operational details about your usage of this feature. Your Privacy We take your privacy seriously. Google takes steps to protect your privacy as part of this process. This includes disconnecting this data from your Google Account, API key, and Cloud project before reviewers see or annotate it. Do not submit feedback on conversations that include sensitive, confidential, or personal information. Opting Out You'll have the option to skip the Inline Preference Voting when it appears. Thank you for helping us improve Google AI Studio! Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-03-24 UTC. \ No newline at end of file diff --git a/docstore/2ed3c698-000c-473b-82f2-5bff62e828ab b/docstore/2ed3c698-000c-473b-82f2-5bff62e828ab new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/2ed3c698-000c-473b-82f2-5bff62e828ab @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/2ee17aed-45f6-403f-a8ac-5eae742003ec b/docstore/2ee17aed-45f6-403f-a8ac-5eae742003ec new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/2ee17aed-45f6-403f-a8ac-5eae742003ec @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/2ef73de0-3d5b-4b15-be5f-314d2a71135b b/docstore/2ef73de0-3d5b-4b15-be5f-314d2a71135b new file mode 100644 index 0000000000000000000000000000000000000000..c368fa6354f991e53179efec4d2408d87bac80a6 --- /dev/null +++ b/docstore/2ef73de0-3d5b-4b15-be5f-314d2a71135b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2efb1394-6429-49da-b226-5ccc6433e012 b/docstore/2efb1394-6429-49da-b226-5ccc6433e012 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/2efb1394-6429-49da-b226-5ccc6433e012 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/2f00a955-48a5-4355-80f3-c5a194734d30 b/docstore/2f00a955-48a5-4355-80f3-c5a194734d30 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/2f00a955-48a5-4355-80f3-c5a194734d30 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/2f05a66b-fb17-413f-8b79-835c05c64308 b/docstore/2f05a66b-fb17-413f-8b79-835c05c64308 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2f05a66b-fb17-413f-8b79-835c05c64308 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
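The replacement table above can also be read as a simple lookup. The sketch below is just a convenience derived from the listed pairs, not an official mapping shipped with the SDK; following the chain resolves a retired code to a currently served one:

Python
# Retired experimental/preview codes -> replacement versions, from the table above
REPLACEMENTS = {
    "gemini-2.5-flash-preview-04-17": "gemini-2.5-flash-preview-05-20",
    "gemini-2.0-flash-exp-image-generation": "gemini-2.0-flash-preview-image-generation",
    "gemini-2.5-pro-preview-06-05": "gemini-2.5-pro",
    "gemini-2.5-pro-preview-05-06": "gemini-2.5-pro",
    "gemini-2.5-pro-preview-03-25": "gemini-2.5-pro",
    "gemini-2.0-flash-thinking-exp-01-21": "gemini-2.5-flash-preview-04-17",
    "gemini-2.0-pro-exp-02-05": "gemini-2.5-pro-preview-03-25",
    "gemini-2.0-flash-exp": "gemini-2.0-flash",
    "gemini-exp-1206": "gemini-2.0-pro-exp-02-05",
    "gemini-2.0-flash-thinking-exp-1219": "gemini-2.0-flash-thinking-exp-01-21",
    "gemini-exp-1121": "gemini-exp-1206",
    "gemini-exp-1114": "gemini-exp-1206",
    "gemini-1.5-pro-exp-0827": "gemini-exp-1206",
    "gemini-1.5-pro-exp-0801": "gemini-exp-1206",
    "gemini-1.5-flash-8b-exp-0924": "gemini-1.5-flash-8b",
    "gemini-1.5-flash-8b-exp-0827": "gemini-1.5-flash-8b",
}

def resolve_model(code):
    """Follow the replacement chain until a code with no listed successor is reached."""
    while code in REPLACEMENTS:
        code = REPLACEMENTS[code]
    return code

print(resolve_model("gemini-exp-1114"))  # gemini-2.5-pro, after following the chain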
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2f0658a2-4419-4ad0-a7bf-f464d2da0b13 b/docstore/2f0658a2-4419-4ad0-a7bf-f464d2da0b13 new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/2f0658a2-4419-4ad0-a7bf-f464d2da0b13 @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/2f07efa3-4813-4d77-9133-09e08ca4adf9 b/docstore/2f07efa3-4813-4d77-9133-09e08ca4adf9 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/2f07efa3-4813-4d77-9133-09e08ca4adf9 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. 
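To make the "return the signature" step concrete, here is a rough sketch of an HTTP-level request body that replays the model's function call part, signature included, alongside the function's result. The getWeather call, its result, and the abbreviated signature are placeholders based on the examples in this section, and the exact role labels should be checked against the current function calling guide:

Python
import json

# The model's previous turn, as returned by the API (signature abbreviated)
model_turn = {
    "role": "model",
    "parts": [{
        "functionCall": {"name": "getWeather", "args": {"city": "Lake Tahoe"}},
        "thoughtSignature": "ClcBVKhc7...",  # copied verbatim from the response
    }],
}

# Result of running the suggested call locally (placeholder value)
weather_result = {"temperature": 25, "unit": "celsius"}

# Keep the signed function call part in the history when sending the result back,
# so the model can restore its earlier reasoning context.
payload = {
    "contents": [
        {"role": "user", "parts": [{"text": "What's the weather in Lake Tahoe?"}]},
        model_turn,
        {"role": "user", "parts": [{"functionResponse": {"name": "getWeather", "response": weather_result}}]},
    ]
}
print(json.dumps(payload, indent=2))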
This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/2f1440ac-969a-49fd-8bd3-571ca2057e4b b/docstore/2f1440ac-969a-49fd-8bd3-571ca2057e4b new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/2f1440ac-969a-49fd-8bd3-571ca2057e4b @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/2f1dfae1-e8fd-40cf-ac3d-6aaec1d8a6d8 b/docstore/2f1dfae1-e8fd-40cf-ac3d-6aaec1d8a6d8 new file mode 100644 index 0000000000000000000000000000000000000000..28f0121ffac96a0ebaad59043c8ea741c7484a9d --- /dev/null +++ b/docstore/2f1dfae1-e8fd-40cf-ac3d-6aaec1d8a6d8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/api-key#provide-api-key-explicitly Title: Using Gemini API keys | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/2f2c1e1f-502d-4574-a826-e098778190c8 b/docstore/2f2c1e1f-502d-4574-a826-e098778190c8 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/2f2c1e1f-502d-4574-a826-e098778190c8 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/2f30ede9-1908-4ff2-b480-23abf849492d b/docstore/2f30ede9-1908-4ff2-b480-23abf849492d new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/2f30ede9-1908-4ff2-b480-23abf849492d @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/2f4cfcb4-5a63-4960-aa28-4b3bc4b0077f b/docstore/2f4cfcb4-5a63-4960-aa28-4b3bc4b0077f new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/2f4cfcb4-5a63-4960-aa28-4b3bc4b0077f @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/2f58f086-8d5b-49f5-87b7-603be890ecea b/docstore/2f58f086-8d5b-49f5-87b7-603be890ecea new file mode 100644 index 0000000000000000000000000000000000000000..3e0dca132b5cee05a4316835e4f2b62d82d3c7fe --- /dev/null +++ b/docstore/2f58f086-8d5b-49f5-87b7-603be890ecea @@ -0,0 +1 @@ +Limits: Function descriptions and parameters count towards your input token limit. 
If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/2f5d5e37-c848-4691-9dd3-065508f416f9 b/docstore/2f5d5e37-c848-4691-9dd3-065508f416f9 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/2f5d5e37-c848-4691-9dd3-065508f416f9 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/2f6130ba-0d6d-4dd5-a5eb-bf0252934748 b/docstore/2f6130ba-0d6d-4dd5-a5eb-bf0252934748 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/2f6130ba-0d6d-4dd5-a5eb-bf0252934748 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. 
We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/2f723802-1171-4c59-aed8-705a7ffca44e b/docstore/2f723802-1171-4c59-aed8-705a7ffca44e new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/2f723802-1171-4c59-aed8-705a7ffca44e @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 
2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/2f7e68fd-5d8b-418d-85f9-080fc4c23d48 b/docstore/2f7e68fd-5d8b-418d-85f9-080fc4c23d48 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/2f7e68fd-5d8b-418d-85f9-080fc4c23d48 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/2f8da6ab-8f0a-43fd-9210-421e7861be94 b/docstore/2f8da6ab-8f0a-43fd-9210-421e7861be94 new file mode 100644 index 0000000000000000000000000000000000000000..5a67c041917cdaf904b0e03794a07af474503a9a --- /dev/null +++ b/docstore/2f8da6ab-8f0a-43fd-9210-421e7861be94 @@ -0,0 +1 @@ +upload..." 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = \ No newline at end of file diff --git a/docstore/2faef00a-2736-48a3-bf7d-b9af69cc22d7 b/docstore/2faef00a-2736-48a3-bf7d-b9af69cc22d7 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/2faef00a-2736-48a3-bf7d-b9af69cc22d7 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
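The inline-video snippet above is cut off in this capture. As a hedged sketch of how such a request might be completed with the google-genai Python SDK (the file path is a placeholder, and the Part.from_bytes usage should be confirmed against the current video understanding docs):

Python
from google import genai
from google.genai import types

client = genai.Client()

# Only for videos where the total request stays under the ~20MB limit
video_file_name = "/path/to/your/video.mp4"  # placeholder path
with open(video_file_name, "rb") as f:
    video_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        types.Part.from_bytes(data=video_bytes, mime_type="video/mp4"),
        "Summarize this video. Then create a quiz with an answer key based on the information in this video.",
    ],
)
print(response.text)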
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/2fc10231-88af-4fa2-b52c-549f70b1c986 b/docstore/2fc10231-88af-4fa2-b52c-549f70b1c986 new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/2fc10231-88af-4fa2-b52c-549f70b1c986 @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/2fcf31cc-0793-4a7a-89fb-1866525d28ee b/docstore/2fcf31cc-0793-4a7a-89fb-1866525d28ee new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/2fcf31cc-0793-4a7a-89fb-1866525d28ee @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! 
Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/2ff2981c-c475-41d8-890b-b2a81c048c49 b/docstore/2ff2981c-c475-41d8-890b-b2a81c048c49 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/2ff2981c-c475-41d8-890b-b2a81c048c49 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/3003965e-a40e-4f28-b585-636d35f4bdef b/docstore/3003965e-a40e-4f28-b585-636d35f4bdef new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/3003965e-a40e-4f28-b585-636d35f4bdef @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/3023e18b-1d4d-4af8-98a5-da5a7b0ef80d b/docstore/3023e18b-1d4d-4af8-98a5-da5a7b0ef80d new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/3023e18b-1d4d-4af8-98a5-da5a7b0ef80d @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . 
Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/30350ba0-7530-4cc6-8deb-8bf89374e0e5 b/docstore/30350ba0-7530-4cc6-8deb-8bf89374e0e5 new file mode 100644 index 0000000000000000000000000000000000000000..96b87b83020710118b1637c95a4f4b11475a3569 --- /dev/null +++ b/docstore/30350ba0-7530-4cc6-8deb-8bf89374e0e5 @@ -0,0 +1 @@ +thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." 
, "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , \ No newline at end of file diff --git a/docstore/30508cb4-e019-45a2-b2b3-3a6daf73d1c2 b/docstore/30508cb4-e019-45a2-b2b3-3a6daf73d1c2 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/30508cb4-e019-45a2-b2b3-3a6daf73d1c2 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/3063ae38-cedf-4073-8fcf-8930b3f9e827 b/docstore/3063ae38-cedf-4073-8fcf-8930b3f9e827 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/3063ae38-cedf-4073-8fcf-8930b3f9e827 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. 
Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/30660490-f031-49de-adeb-5538f2a94a4e b/docstore/30660490-f031-49de-adeb-5538f2a94a4e new file mode 100644 index 0000000000000000000000000000000000000000..29805a750d326aab08740367fc13678c1846ec09 --- /dev/null +++ b/docstore/30660490-f031-49de-adeb-5538f2a94a4e @@ -0,0 +1 @@ +default. Here, you disable it. from google import genai from google.genai import types client = genai . Client () def get_current_weather ( location : str ) - > str : """Get the current weather in a given location. Args: location: required, The city and state, e.g. San Francisco, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ], automatic_function_calling = { 'disable' : True }, ), ) function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call Automatic function calling Before Python The old SDK only supports automatic function calling in chat. In the new SDK this is the default behavior in generate_content . import google.generativeai as genai def get_current_weather ( city : str ) - > str : return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) chat = model . start_chat ( enable_automatic_function_calling = True ) result = chat . send_message ( "What is the weather in San Francisco?" ) After Python from google import genai from google.genai import types client = genai . Client () def get_current_weather ( city : str ) - > str : return "23C" response = client . models .
generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ] ), ) Code execution Code execution is a tool that allows the model to generate Python code, run it, and return the result. Before Python import google.generativeai as genai model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = "code_execution" ) result = model . generate_content ( "What is the sum of the first 50 prime numbers? Generate and run code for " "the calculation, and make sure you \ No newline at end of file diff --git a/docstore/308856d0-c1b3-419d-b9e0-e53b1c24f120 b/docstore/308856d0-c1b3-419d-b9e0-e53b1c24f120 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/308856d0-c1b3-419d-b9e0-e53b1c24f120 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
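The code execution example above shows the "Before" side of the migration with the legacy SDK; with the google-genai SDK, code execution is enabled through a tool in the request config. A minimal sketch, with the model choice and response handling as illustrative assumptions:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the sum of the first 50 prime numbers? "
             "Generate and run code for the calculation.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())]
    ),
)

# The response interleaves text, the generated code, and its execution result.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    if part.executable_code is not None:
        print(part.executable_code.code)
    if part.code_execution_result is not None:
        print(part.code_execution_result.output)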
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/309b94b5-1b9c-48a4-a1a6-572e87e8ea35 b/docstore/309b94b5-1b9c-48a4-a1a6-572e87e8ea35 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/309b94b5-1b9c-48a4-a1a6-572e87e8ea35 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/30a260ec-d6a5-463a-82d4-935553702847 b/docstore/30a260ec-d6a5-463a-82d4-935553702847 new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/30a260ec-d6a5-463a-82d4-935553702847 @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . 
Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/30ae347b-da4e-4c0a-98e8-766de8ff9eaf b/docstore/30ae347b-da4e-4c0a-98e8-766de8ff9eaf new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/30ae347b-da4e-4c0a-98e8-766de8ff9eaf @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. 
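The Temperature recommendation in the best practices above is straightforward to apply: pass a low temperature alongside the tool declarations in the request config. A brief sketch; the schedule_meeting declaration and prompt are hypothetical examples introduced here for illustration.

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical function declaration used only to illustrate the configuration.
schedule_meeting = {
    "name": "schedule_meeting",
    "description": "Schedules a meeting at a given start time.",
    "parameters": {
        "type": "object",
        "properties": {
            "start_time": {"type": "string", "description": "ISO 8601 start time."},
        },
        "required": ["start_time"],
    },
}

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Schedule a planning meeting for 2025-08-01 at 10:00.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[schedule_meeting])],
        # Temperature 0 makes tool selection and argument filling more deterministic.
        temperature=0,
    ),
)
print(response.function_calls)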
Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/30c21b8a-a2a5-4e3a-9fb1-9ae853f901e4 b/docstore/30c21b8a-a2a5-4e3a-9fb1-9ae853f901e4 new file mode 100644 index 0000000000000000000000000000000000000000..e652ebcdf342b29a27305f6af4427b0dbb03d3f1 --- /dev/null +++ b/docstore/30c21b8a-a2a5-4e3a-9fb1-9ae853f901e4 @@ -0,0 +1 @@ +"gemini-2.5-flash" , contents = "What's the temperature in London?" , config = config , ) # Check for a function call if response . candidates [ 0 ] . content . parts [ 0 ] . function_call : function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call print ( f "Function to call: { function_call . name } " ) print ( f "Arguments: { function_call . args } " ) # In a real app, you would call your function here: # result = get_current_temperature(**function_call.args) else : print ( "No function call found in the response." ) print ( response . text ) JavaScript import { GoogleGenAI , Type } from '@google/genai' ; // Configure the client const ai = new GoogleGenAI ({}); // Define the function declaration for the model const weatherFunctionDeclaration = { name : 'get_current_temperature' , description : 'Gets the current temperature for a given location.' , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , description : 'The city name, e.g. San Francisco' , }, }, required : [ 'location' ], }, }; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : "What's the temperature in London?" , config : { tools : [{ functionDeclarations : [ weatherFunctionDeclaration ] }], }, }); // Check for function calls in the response if ( response . functionCalls && response . functionCalls . length > 0 ) { const functionCall = response . functionCalls [ 0 ]; // Assuming one function call console . log ( `Function to call: ${ functionCall . name } ` ); console . log ( `Arguments: ${ JSON . stringify ( functionCall . args ) } ` ); // In a real app, you would call your actual function here: // const result = await getCurrentTemperature(functionCall.args); } else { console . log ( "No function call found in the response." ); console . log ( response . text ); } REST curl \ No newline at end of file diff --git a/docstore/30c7eac9-a6a6-494e-bd1a-c4334ce2e00e b/docstore/30c7eac9-a6a6-494e-bd1a-c4334ce2e00e new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/30c7eac9-a6a6-494e-bd1a-c4334ce2e00e @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
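The check shown earlier in this passage stops at detecting the function call; in practice the next step is to execute the function and send its result back so the model can produce a final answer. A hedged sketch of that round trip with the Python SDK follows; get_current_temperature and its declaration are illustrative stand-ins.

from google import genai
from google.genai import types

client = genai.Client()

def get_current_temperature(location: str) -> dict:
    """Illustrative stub standing in for a real weather lookup."""
    return {"temperature": 18, "unit": "celsius"}

temperature_declaration = {
    "name": "get_current_temperature",
    "description": "Gets the current temperature for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {"type": "string", "description": "The city name, e.g. San Francisco"},
        },
        "required": ["location"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[temperature_declaration])]
)
contents = [
    types.Content(role="user", parts=[types.Part(text="What's the temperature in London?")])
]

first = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
function_call = first.candidates[0].content.parts[0].function_call

# Execute the function, then append the model's call and our response so the
# model can turn the raw result into a user-facing answer.
result = get_current_temperature(**function_call.args)
contents.append(first.candidates[0].content)
contents.append(
    types.Content(
        role="user",
        parts=[types.Part.from_function_response(
            name=function_call.name, response={"result": result}
        )],
    )
)

final = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
print(final.text)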
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/30e0543a-080e-4ac3-8096-ea1647041bcc b/docstore/30e0543a-080e-4ac3-8096-ea1647041bcc new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/30e0543a-080e-4ac3-8096-ea1647041bcc @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. 
Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/30e2d082-6a2c-44ed-8665-9ef271d8c707 b/docstore/30e2d082-6a2c-44ed-8665-9ef271d8c707 new file mode 100644 index 0000000000000000000000000000000000000000..ebd105342549a255faf01232b49ba70d20b000ef --- /dev/null +++ b/docstore/30e2d082-6a2c-44ed-8665-9ef271d8c707 @@ -0,0 +1 @@ +the next tier. Why use the paid tier? When you enable billing and use the paid tier, you benefit from higher rate limits , and your prompts and responses aren't used to improve Google products. For more information on data use for paid services, see the terms of service . Cloud Billing The Gemini API uses Cloud Billing for billing services. To use the paid tier, you must set up Cloud Billing on your cloud project. After you've enabled Cloud Billing, you can use Cloud Billing tools to track spending, understand costs, make payments, and access Cloud Billing support. Enable billing You can enable Cloud Billing starting from Google AI Studio: Open Google AI Studio . In the bottom of the left sidebar, select Settings > Plan information . Click Set up Billing for your chosen project to enable Cloud Billing. Monitor usage After you enable Cloud Billing, you can monitor your usage of the Gemini API in the Google Cloud console . The service name for the API is generativelanguage.googleapis.com , and in the console the Gemini API is also referred to as the Generative Language API . To learn more, see the Google Cloud documentation on monitoring API usage . Frequently asked questions This section provides answers to frequently asked questions. What am I billed for? Gemini API pricing is based on the following: Input token count Output token count Cached token count Cached token storage duration For pricing information, see the pricing page . Where can I view my quota? You can view your quota and system limits in the Google Cloud console . How do I request more quota? To request more quota, follow the instructions at How to request an upgrade . Can I use the Gemini API for free in EEA (including EU), the UK, and CH? Yes, we make the free tier and paid tier available in many regions . If I set up billing with the Gemini API, will I be charged for my Google AI Studio usage? No, Google AI Studio usage remains free of charge regardless of if you set up billing across all supported \ No newline at end of file diff --git a/docstore/30ed0438-ab66-419c-b185-272277dce226 b/docstore/30ed0438-ab66-419c-b185-272277dce226 new file mode 100644 index 0000000000000000000000000000000000000000..097e48b20f2cbfa1b05db2a0f80e7f3c1583707a --- /dev/null +++ b/docstore/30ed0438-ab66-419c-b185-272277dce226 @@ -0,0 +1 @@ +Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
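The output-transcription setup shown above can be exercised end to end with a short Live API session. A minimal sketch, assuming the same preview model and the output_transcription field on server content; the prompt mirrors the one in the passage above.

import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["AUDIO"], "output_audio_transcription": {}}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello? Gemini are you there?"}]},
            turn_complete=True,
        )
        async for response in session.receive():
            # The transcription of the generated audio arrives alongside the audio itself.
            server_content = response.server_content
            if server_content and server_content.output_transcription:
                if server_content.output_transcription.text:
                    print(server_content.output_transcription.text)

if __name__ == "__main__":
    asyncio.run(main())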
Learn more Home Gemini API Models Gemini Developer API Get a Gemini API Key Get a Gemini API key and make your first API request in minutes. Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = new Client (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H \ No newline at end of file diff --git a/docstore/30fb9539-db32-4c37-af47-1acef3815fd8 b/docstore/30fb9539-db32-4c37-af47-1acef3815fd8 new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/30fb9539-db32-4c37-af47-1acef3815fd8 @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details.
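The voice and language settings shown above can also be combined in one typed configuration object. A small sketch using the SDK's config classes; the voice name and language code are just the examples from this section, and the result is passed as config= to client.aio.live.connect.

from google.genai import types

# One speech_config can carry both a prebuilt voice and a language code
# (half-cascade models only; native audio models pick the language themselves).
config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
        ),
        language_code="de-DE",
    ),
)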
How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/3107701f-85cb-4349-b093-75af0efe4f8d b/docstore/3107701f-85cb-4349-b093-75af0efe4f8d new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/3107701f-85cb-4349-b093-75af0efe4f8d @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/311ca055-3dd6-4374-a55e-dd56edbd50bf b/docstore/311ca055-3dd6-4374-a55e-dd56edbd50bf new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/311ca055-3dd6-4374-a55e-dd56edbd50bf @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/3123d78b-8d8e-4a6d-bdc3-55bb6668d340 b/docstore/3123d78b-8d8e-4a6d-bdc3-55bb6668d340 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/3123d78b-8d8e-4a6d-bdc3-55bb6668d340 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/3137aa60-2fe0-401e-968f-9ccf7bff6b5e b/docstore/3137aa60-2fe0-401e-968f-9ccf7bff6b5e new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/3137aa60-2fe0-401e-968f-9ccf7bff6b5e @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/313876f5-5e03-42d2-b2b2-b8be7c61dc4f b/docstore/313876f5-5e03-42d2-b2b2-b8be7c61dc4f new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/313876f5-5e03-42d2-b2b2-b8be7c61dc4f @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/3144ca08-6a7a-4f1f-add8-64e1fc332d3a b/docstore/3144ca08-6a7a-4f1f-add8-64e1fc332d3a new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/3144ca08-6a7a-4f1f-add8-64e1fc332d3a @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/31709eda-f21d-4f3c-92b9-87bc57a0ac95 b/docstore/31709eda-f21d-4f3c-92b9-87bc57a0ac95 new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/31709eda-f21d-4f3c-92b9-87bc57a0ac95 @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. 
Ambiance This example \ No newline at end of file diff --git a/docstore/3171ea2e-6211-4654-a4c4-883b78d39ed3 b/docstore/3171ea2e-6211-4654-a4c4-883b78d39ed3 new file mode 100644 index 0000000000000000000000000000000000000000..88c00154929a60afbed4badff3d232f79f564557 --- /dev/null +++ b/docstore/3171ea2e-6211-4654-a4c4-883b78d39ed3 @@ -0,0 +1 @@ +can take many forms including researching state of the art studies in your app domain, observing how people are using similar apps, or running a user study, survey, or conducting informal interviews with potential users. Advanced tips Speak with a diverse mix of prospective users within your target population about your application and its intended purpose so as to get a wider perspective on potential risks and to adjust diversity criteria as needed. The AI Risk Management Framework released by the U.S. government's National Institute of Standards and Technology (NIST) provides more detailed guidance and additional learning resources for AI risk management. DeepMind's publication on the ethical and social risks of harm from language models describes in detail the ways that language model applications can cause harm. Consider adjustments to mitigate safety risks Now that you have an understanding of the risks, you can decide how to mitigate them. Determining which risks to prioritize and how much you should do to try to prevent them is a critical decision, similar to triaging bugs in a software project. Once you've determined priorities, you can start thinking about the types of mitigations that would be most appropriate. Often simple changes can make a difference and reduce risks. For example, when designing an application consider: Tuning the model output to better reflect what is acceptable in your application context. Tuning can make the output of the model more predictable and consistent and therefore can help mitigate certain risks. Providing an input method that facilities safer outputs. The exact input you give to an LLM can make a difference in the quality of the output. Experimenting with input prompts to find what works most safely in your use-case is well worth the effort, as you can then provide a UX that facilitates it. For example, you could restrict users to choose only from a drop-down list of input prompts, or offer pop-up suggestions with \ No newline at end of file diff --git a/docstore/31a90d79-220f-49d8-8bc1-92cd50bcc063 b/docstore/31a90d79-220f-49d8-8bc1-92cd50bcc063 new file mode 100644 index 0000000000000000000000000000000000000000..fb06dda2e8483dd7dcefb64302c607622d7348ef --- /dev/null +++ b/docstore/31a90d79-220f-49d8-8bc1-92cd50bcc063 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode#batch-job-status Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/31eae1f2-cccc-4dd2-bb27-9b0c289a6a70 b/docstore/31eae1f2-cccc-4dd2-bb27-9b0c289a6a70 new file mode 100644 index 0000000000000000000000000000000000000000..024dbd78fa7ec55b1f91d59d77cedd6de4a8bd83 --- /dev/null +++ b/docstore/31eae1f2-cccc-4dd2-bb27-9b0c289a6a70 @@ -0,0 +1 @@ +requiring re-initiation of the provisioning process. Verify secure authentication for your own backend. Ephemeral tokens will only be as secure as your backend authentication method. Generally, avoid using ephemeral tokens for backend-to-Gemini connections, as this path is typically considered secure. Limitations Ephemeral tokens are only compatible with Live API at this time. 
What's next Read the Live API reference on ephemeral tokens for more information. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/31f4d7db-0dfd-4380-8ceb-7c651d0f3f35 b/docstore/31f4d7db-0dfd-4380-8ceb-7c651d0f3f35 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/31f4d7db-0dfd-4380-8ceb-7c651d0f3f35 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/31f4fde4-0bca-49d3-ace3-06af0508ea86 b/docstore/31f4fde4-0bca-49d3-ace3-06af0508ea86 new file mode 100644 index 0000000000000000000000000000000000000000..5a67c041917cdaf904b0e03794a07af474503a9a --- /dev/null +++ b/docstore/31f4fde4-0bca-49d3-ace3-06af0508ea86 @@ -0,0 +1 @@ +upload..." 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = \ No newline at end of file diff --git a/docstore/31fe2f96-d3af-4a7e-91f1-645fcb3d1e43 b/docstore/31fe2f96-d3af-4a7e-91f1-645fcb3d1e43 new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/31fe2f96-d3af-4a7e-91f1-645fcb3d1e43 @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a the 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. 
Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/32012ae6-2b0c-4079-a38e-5d932f637a15 b/docstore/32012ae6-2b0c-4079-a38e-5d932f637a15 new file mode 100644 index 0000000000000000000000000000000000000000..d503fefb6e007ea7e18de8548faa948d780e8f08 --- /dev/null +++ b/docstore/32012ae6-2b0c-4079-a38e-5d932f637a15 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/long-context#main-content Title: Long context | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/32074c10-1ca0-4c01-9485-0d3386923361 b/docstore/32074c10-1ca0-4c01-9485-0d3386923361 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/32074c10-1ca0-4c01-9485-0d3386923361 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. 
Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/321ccf46-8c40-4a8e-83a6-3e69e6de09b8 b/docstore/321ccf46-8c40-4a8e-83a6-3e69e6de09b8 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/321ccf46-8c40-4a8e-83a6-3e69e6de09b8 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/321fc4a7-8eee-429e-a217-6b5dd75d0baa b/docstore/321fc4a7-8eee-429e-a217-6b5dd75d0baa new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/321fc4a7-8eee-429e-a217-6b5dd75d0baa @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. 
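For example, the following minimal sketch (assuming the google-genai Python SDK used in the surrounding examples; the prompt and value are illustrative) sets the temperature through GenerateContentConfig:

from google import genai
from google.genai import types

client = genai.Client()

# temperature controls the sampling stage described above: values near 0 keep
# decoding close to greedy selection, while larger values sample more freely.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="The dog jumped over the",
    config=types.GenerateContentConfig(temperature=0.2),
)
print(response.text)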
A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/322547c9-e879-421e-924f-0a7053019c81 b/docstore/322547c9-e879-421e-924f-0a7053019c81 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/322547c9-e879-421e-924f-0a7053019c81 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/322c4e31-34ee-4377-9827-5fab0d4eb078 b/docstore/322c4e31-34ee-4377-9827-5fab0d4eb078 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/322c4e31-34ee-4377-9827-5fab0d4eb078 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/3244cf8c-16d6-4be0-8c4b-eb850b22033a b/docstore/3244cf8c-16d6-4be0-8c4b-eb850b22033a new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/3244cf8c-16d6-4be0-8c4b-eb850b22033a @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/326251cb-239e-4213-b555-4cd93c721394 b/docstore/326251cb-239e-4213-b555-4cd93c721394 new file mode 100644 index 0000000000000000000000000000000000000000..b6194c8105fe9f40c0d9a89b00594ca4d33e213c --- /dev/null +++ b/docstore/326251cb-239e-4213-b555-4cd93c721394 @@ -0,0 +1 @@ +While meeting the stated qualification criteria is generally sufficient for approval, in rare cases an upgrade request may be denied based on other factors identified during the review process. This system helps maintain the security and integrity of the Gemini API platform for all users. Standard API rate limits The following table lists the rate limits for all standard Gemini API calls. Free Tier Model RPM TPM RPD Gemini 2.5 Pro 5 250,000 100 Gemini 2.5 Flash 10 250,000 250 Gemini 2.5 Flash-Lite Preview 06-17 15 250,000 1,000 Gemini 2.5 Flash Preview TTS 3 10,000 15 Gemini 2.5 Pro Preview TTS -- -- -- Gemini 2.0 Flash 15 1,000,000 200 Gemini 2.0 Flash Preview Image Generation 10 200,000 100 Gemini 2.0 Flash-Lite 30 1,000,000 200 Imagen 3 -- -- -- Veo 2 -- -- -- Gemini 1.5 Flash (Deprecated) 15 250,000 50 Gemini 1.5 Flash-8B (Deprecated) 15 250,000 50 Gemini 1.5 Pro (Deprecated) -- -- -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 5 -- 100 Tier 1 Model RPM TPM RPD Gemini 2.5 Pro 150 2,000,000 1,000 Gemini 2.5 Flash 1,000 1,000,000 10,000 Gemini 2.5 Flash-Lite Preview 06-17 4,000 4,000,000 -- Gemini 2.5 Flash Preview TTS 10 10,000 100 Gemini 2.5 Pro Preview TTS 10 10,000 50 Gemini 2.0 Flash 2,000 4,000,000 -- Gemini 2.0 Flash Preview Image Generation 1,000 1,000,000 10,000 Gemini 2.0 Flash-Lite 4,000 4,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 2 Model RPM TPM RPD Gemini 2.5 Pro 1,000 5,000,000 50,000 Gemini 2.5 Flash 2,000 3,000,000 100,000 Gemini 2.5 Flash-Lite Preview 06-17 10,000 10,000,000 100,000 Gemini 2.5 Flash Preview TTS 1,000 100,000 10,000 Gemini 2.5 Pro Preview TTS 100 25,000 1,000 Gemini 2.0 Flash 10,000 10,000,000 -- \ No newline at end of file diff --git a/docstore/32774cd6-4dbf-4c1f-83f5-bd5f18984013 b/docstore/32774cd6-4dbf-4c1f-83f5-bd5f18984013 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/32774cd6-4dbf-4c1f-83f5-bd5f18984013 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' 
, parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/3287dd11-5f7c-4c54-bde1-e00e8aa65e83 b/docstore/3287dd11-5f7c-4c54-bde1-e00e8aa65e83 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/3287dd11-5f7c-4c54-bde1-e00e8aa65e83 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
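# A minimal hedged sketch of how the call above could be completed, assuming the
# google-genai Python SDK and the set_light_values_declaration from Step 1; the
# prompt string is illustrative.
config = types.GenerateContentConfig(tools=[tools])
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn the lights down to a warm glow at about 30% brightness.",
    config=config,
)
# If the model chose to call the function, the suggestion arrives as a function_call part.
part = response.candidates[0].content.parts[0]
if part.function_call:
    print(part.function_call.name, part.function_call.args)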
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/32b54110-0cb6-4b45-a049-d5ca8b1e6573 b/docstore/32b54110-0cb6-4b45-a049-d5ca8b1e6573 new file mode 100644 index 0000000000000000000000000000000000000000..b73659061f0ce2830a1e6cf67f6a74b5cc699bc6 --- /dev/null +++ b/docstore/32b54110-0cb6-4b45-a049-d5ca8b1e6573 @@ -0,0 +1 @@ +"turn_off_the_lights" } tools = [{ "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Turn on the lights please" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) elif chunk . tool_call : function_responses = [] for fc in chunk . tool_call . function_calls : function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" } # simple, hard-coded function response ) function_responses . append ( function_response ) await session . send_tool_response ( function_responses = function_responses ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; // Simple function definitions const turn_on_the_lights = { name : "turn_on_the_lights" } // , description: '...', parameters: { ... } const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { \ No newline at end of file diff --git a/docstore/32c07a5c-12ba-45de-a409-3fb73afbe6fe b/docstore/32c07a5c-12ba-45de-a409-3fb73afbe6fe new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/32c07a5c-12ba-45de-a409-3fb73afbe6fe @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . 
upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/32dd7082-5327-4963-a857-e94e43325c3e b/docstore/32dd7082-5327-4963-a857-e94e43325c3e new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/32dd7082-5327-4963-a857-e94e43325c3e @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/32e7bd4b-67df-4e50-a8de-6ac1ccdc322a b/docstore/32e7bd4b-67df-4e50-a8de-6ac1ccdc322a new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/32e7bd4b-67df-4e50-a8de-6ac1ccdc322a @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. 
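As a minimal sketch of sending such a subject-context-style prompt (assuming the google-genai Python SDK and the Imagen 3 model code listed in the model table; the display step uses Pillow and is illustrative):

from io import BytesIO
from PIL import Image
from google import genai
from google.genai import types

client = genai.Client()

# Subject: a modern apartment building; context: surrounded by skyscrapers; style: a sketch.
response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A sketch of a modern apartment building surrounded by skyscrapers",
    config=types.GenerateImagesConfig(number_of_images=1),
)
for generated_image in response.generated_images:
    Image.open(BytesIO(generated_image.image.image_bytes)).show()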
Start by \ No newline at end of file diff --git a/docstore/32fed38d-ebb5-417c-a686-4f39afe8cbb0 b/docstore/32fed38d-ebb5-417c-a686-4f39afe8cbb0 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/32fed38d-ebb5-417c-a686-4f39afe8cbb0 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3300a928-dd1f-4647-9b60-8b27fbb86ed1 b/docstore/3300a928-dd1f-4647-9b60-8b27fbb86ed1 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/3300a928-dd1f-4647-9b60-8b27fbb86ed1 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/330a9df3-54e3-4a3e-8835-68e40e2c9a3f b/docstore/330a9df3-54e3-4a3e-8835-68e40e2c9a3f new file mode 100644 index 0000000000000000000000000000000000000000..2bc9ee1b64943d2fc9ee4b66d281a35e0e278a02 --- /dev/null +++ b/docstore/330a9df3-54e3-4a3e-8835-68e40e2c9a3f @@ -0,0 +1 @@ +Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . 
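A minimal sketch of enabling session resumption (assuming the google-genai Python SDK; the SessionResumptionConfig handle field and the session_resumption_update message are taken from the Live API guide and should be treated as assumptions here):

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

# Pass the handle saved from an earlier session to resume it on a new connection;
# None (or omitting the field) starts a fresh resumable session.
config = types.LiveConnectConfig(
    response_modalities=["TEXT"],
    session_resumption=types.SessionResumptionConfig(handle=None),
)

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(turns={"parts": [{"text": "Hello"}]})
        async for message in session.receive():
            update = message.session_resumption_update
            if update and update.new_handle:
                # Store the latest handle so the session can be resumed later.
                print("resumption handle:", update.new_handle)

if __name__ == "__main__":
    asyncio.run(main())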
You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( \ No newline at end of file diff --git a/docstore/3365f6de-77c2-4fbc-adc9-c07656c47b7e b/docstore/3365f6de-77c2-4fbc-adc9-c07656c47b7e new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/3365f6de-77c2-4fbc-adc9-c07656c47b7e @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . 
text ); } await main (); \ No newline at end of file diff --git a/docstore/339bba42-663c-4351-9ba8-0839c35e3bb9 b/docstore/339bba42-663c-4351-9ba8-0839c35e3bb9 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/339bba42-663c-4351-9ba8-0839c35e3bb9 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/33a6461e-40c2-42d6-aa14-09145229da35 b/docstore/33a6461e-40c2-42d6-aa14-09145229da35 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/33a6461e-40c2-42d6-aa14-09145229da35 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/33c47075-cc97-4adb-96bb-4ba2bc9dffeb b/docstore/33c47075-cc97-4adb-96bb-4ba2bc9dffeb new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/33c47075-cc97-4adb-96bb-4ba2bc9dffeb @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
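Since 2.5 Flash thinks by default, a thinking budget can be tuned (or set to 0 to disable thinking) per request; a minimal sketch assuming the google-genai Python SDK, mirroring the REST thinkingConfig example earlier in this collection:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    config=types.GenerateContentConfig(
        # 0 disables thinking for latency-sensitive calls; larger budgets allow more reasoning.
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    ),
)
print(response.text)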
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/33d29cfa-231f-4a36-bd52-ad3dab0aae4b b/docstore/33d29cfa-231f-4a36-bd52-ad3dab0aae4b new file mode 100644 index 0000000000000000000000000000000000000000..a76efec9a9a3e7390e77e9a866cc227646391c5b --- /dev/null +++ b/docstore/33d29cfa-231f-4a36-bd52-ad3dab0aae4b @@ -0,0 +1 @@ +Billing | Gemini API | Google AI for Developers Billing This guide provides an overview of different Gemini API billing options, explains how to enable billing and monitor usage, and provides answers to frequently asked questions (FAQs) about billing. Upgrade to the Gemini API paid tier About billing Billing for the Gemini API is based on two pricing tiers: free of charge (or free ) and pay-as-you-go (or paid ). Pricing and rate limits differ between these tiers and also vary by model. You can check out the rate limits and pricing pages for more info. For a model-by-model breakdown of capabilities, see the Gemini models page . How to request an upgrade To transition from the free tier to the pay-as-you-go plan, you need to enable billing for your Google Cloud project. The button you see in Google AI Studio depends on your project's current plan. If you're on the free tier, you'll see a Set up Billing button for your project. If you're already on the paid tier and meet the criteria for a plan change, you might see an Upgrade button. To start the process, follow these steps: Go to the AI Studio API keys page . Find the project you want to move to the paid plan and click either Set up Billing or Upgrade , depending on the button displayed. The next step depends on the button you clicked: If you clicked Set up Billing: You'll be redirected to the Google Cloud console to link a billing account to your project. Follow the on-screen instructions to complete the process. If you clicked Upgrade: The system will automatically verify your project's eligibility. If your project meets all the requirements, it will be instantly upgraded to \ No newline at end of file diff --git a/docstore/33e15401-0944-4d9f-a9de-b7ac67160d09 b/docstore/33e15401-0944-4d9f-a9de-b7ac67160d09 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/33e15401-0944-4d9f-a9de-b7ac67160d09 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks.
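As a minimal sketch (assuming the google-genai Python SDK used elsewhere in these guides), embeddings for two strings can be requested with text-embedding-004 and compared, for example with cosine similarity:

from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How do I bake a pie?", "What's a good pie recipe?"],
)

# Each embedding is a 768-dimensional vector for this model.
vec_a, vec_b = (e.values for e in result.embeddings)
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = (sum(a * a for a in vec_a) ** 0.5) * (sum(b * b for b in vec_b) ** 0.5)
print("cosine similarity:", dot / norm)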
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/33f75103-93bb-4257-9ac0-8da9e11fb9ec b/docstore/33f75103-93bb-4257-9ac0-8da9e11fb9ec new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/33f75103-93bb-4257-9ac0-8da9e11fb9ec @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want the model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology .
(gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/33febf34-b75c-4469-bedb-79c6582b307f b/docstore/33febf34-b75c-4469-bedb-79c6582b307f new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/33febf34-b75c-4469-bedb-79c6582b307f @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/3401c554-746d-46cd-88ee-44f7945262ea b/docstore/3401c554-746d-46cd-88ee-44f7945262ea new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/3401c554-746d-46cd-88ee-44f7945262ea @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . 
NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/3420f638-fe3f-4e36-85a2-9c668a2daa79 b/docstore/3420f638-fe3f-4e36-85a2-9c668a2daa79 new file mode 100644 index 0000000000000000000000000000000000000000..aee05449de83b11c2592ad958994cdaf02f0141d --- /dev/null +++ b/docstore/3420f638-fe3f-4e36-85a2-9c668a2daa79 @@ -0,0 +1 @@ +done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Turn on the lights please' ; session . sendClientContent ({ turns : inputTurns }); let turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } else if ( turn . toolCall ) { const functionResponses = []; for ( const fc of turn . toolCall . functionCalls ) { functionResponses . push ({ id : fc . id , name : fc . name , response : { result : "ok" } // simple, hard-coded function response }); } console . debug ( 'Sending tool response...\n' ); session . sendToolResponse ({ functionResponses : functionResponses }); } } // Check again for new messages turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . 
text ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); From a single prompt, the model can generate multiple function calls and the code necessary to chain their outputs. This code executes in a sandbox environment, generating subsequent BidiGenerateContentToolCall messages. Asynchronous function calling Note: \ No newline at end of file diff --git a/docstore/34227108-a659-4eda-9312-b5746c008f37 b/docstore/34227108-a659-4eda-9312-b5746c008f37 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/34227108-a659-4eda-9312-b5746c008f37 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/342f6e6d-7712-47be-979d-c80dcea1457a b/docstore/342f6e6d-7712-47be-979d-c80dcea1457a new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/342f6e6d-7712-47be-979d-c80dcea1457a @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
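The JavaScript handleTurn loop above receives a toolCall from the Live API session and replies with sendToolResponse. A minimal sketch of the same pattern with the Python SDK follows; the hypothetical turn_on_the_lights tool and the hard-coded "ok" result mirror the example above rather than a real integration, and the exact wiring should be treated as an assumption.

```python
import asyncio
from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical tool declaration, mirroring the lights example above.
config = {
    "response_modalities": ["TEXT"],
    "tools": [{"function_declarations": [{"name": "turn_on_the_lights"}]}],
}

async def main():
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Turn on the lights please"}]}
        )
        async for message in session.receive():
            if message.text:
                print("Received text:", message.text)
            elif message.tool_call:
                # Answer every requested function call with a simple, hard-coded result.
                responses = [
                    types.FunctionResponse(id=fc.id, name=fc.name, response={"result": "ok"})
                    for fc in message.tool_call.function_calls
                ]
                await session.send_tool_response(function_responses=responses)

asyncio.run(main())
```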
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/3431b907-47a8-41bd-8508-abce33ec03b4 b/docstore/3431b907-47a8-41bd-8508-abce33ec03b4 new file mode 100644 index 0000000000000000000000000000000000000000..f032a4989a26fafb20f912c87361ffdb01685c65 --- /dev/null +++ b/docstore/3431b907-47a8-41bd-8508-abce33ec03b4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-intro Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/34331f54-b299-40ab-88fa-6259b340f881 b/docstore/34331f54-b299-40ab-88fa-6259b340f881 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/34331f54-b299-40ab-88fa-6259b340f881 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
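As a quick cross-check of the model variant overview above against what a given API key can actually reach, the SDK exposes a model listing call. This is a minimal sketch assuming the google-genai Python client used elsewhere in these pages; the printed fields are illustrative.

```python
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment

# Print every model variant visible to this key.
for model in client.models.list():
    print(model.name, "-", model.display_name)
```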
\ No newline at end of file diff --git a/docstore/345296d8-387e-475d-9db8-f7bc45eda8f1 b/docstore/345296d8-387e-475d-9db8-f7bc45eda8f1 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/345296d8-387e-475d-9db8-f7bc45eda8f1 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/34719b70-4d73-405d-b130-f2162d3a7f75 b/docstore/34719b70-4d73-405d-b130-f2162d3a7f75 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/34719b70-4d73-405d-b130-f2162d3a7f75 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . 
GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/34755a68-14a2-4cac-8f1e-7105a734d114 b/docstore/34755a68-14a2-4cac-8f1e-7105a734d114 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/34755a68-14a2-4cac-8f1e-7105a734d114 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
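To try the Imagen variants listed above from code, the Python SDK has a dedicated image generation entry point. A minimal sketch, assuming the imagen-3.0-generate-002 model code from the table; the prompt and output file names are placeholders.

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A photorealistic fennec fox in a desert at sunset",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Each generated image arrives as raw bytes that can be written to disk.
for i, generated in enumerate(response.generated_images):
    with open(f"imagen-output-{i}.png", "wb") as f:
        f.write(generated.image.image_bytes)
```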
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/3477b638-280d-4219-881d-6bf7532a804a b/docstore/3477b638-280d-4219-881d-6bf7532a804a new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/3477b638-280d-4219-881d-6bf7532a804a @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/348c23f2-7772-45e0-81cc-5cc044816b7e b/docstore/348c23f2-7772-45e0-81cc-5cc044816b7e new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/348c23f2-7772-45e0-81cc-5cc044816b7e @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. 
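Since the 2.0 Flash Live entry above lists audio output alongside text, a text-in, audio-out exchange might look like the following minimal sketch. The 24 kHz, 16-bit mono format is the documented Live API output default, but the wiring here is an assumption rather than a drop-in client.

```python
import asyncio
import wave
from google import genai

client = genai.Client()
config = {"response_modalities": ["AUDIO"]}

async def main():
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Say hello in one short sentence."}]}
        )
        # Collect the streamed PCM chunks into a single WAV file.
        with wave.open("reply.wav", "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)    # 16-bit samples
            wf.setframerate(24000)
            async for message in session.receive():
                if message.data is not None:
                    wf.writeframes(message.data)

asyncio.run(main())
```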
The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/34b70c88-0c1c-4a02-91c5-39b881830ef4 b/docstore/34b70c88-0c1c-4a02-91c5-39b881830ef4 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/34b70c88-0c1c-4a02-91c5-39b881830ef4 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
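The Gemini Embedding entry above lists an elastic output dimension (3072, 1536, or 768). A minimal sketch of requesting a reduced-dimension embedding, assuming the gemini-embedding-exp-03-07 model code from the table and the SDK's embed_content call:

```python
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="Measuring the relatedness of text strings",
    config=types.EmbedContentConfig(output_dimensionality=768),
)

# One embedding per input; each holds a list of float values.
print(len(result.embeddings[0].values))  # expected: 768
```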
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/34c6e8b5-788c-48cb-b8ca-a96bb6ca4f32 b/docstore/34c6e8b5-788c-48cb-b8ca-a96bb6ca4f32 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/34c6e8b5-788c-48cb-b8ca-a96bb6ca4f32 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. 
The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/34dd8e4d-4834-4f35-8822-d0b1dfee595c b/docstore/34dd8e4d-4834-4f35-8822-d0b1dfee595c new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/34dd8e4d-4834-4f35-8822-d0b1dfee595c @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/34e1797f-6755-4aae-a0ac-143c2c725c4a b/docstore/34e1797f-6755-4aae-a0ac-143c2c725c4a new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/34e1797f-6755-4aae-a0ac-143c2c725c4a @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
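The prompting strategy above (ask the model to describe each image before comparing them, and to refer to what is in the images) can be wired up with the same inline-image parts used elsewhere in these docs. A minimal sketch with placeholder file paths:

```python
from google import genai
from google.genai import types

client = genai.Client()

# Placeholder paths; any small JPEGs under the 20MB inline request limit will do.
paths = ["image1.jpg", "image2.jpg", "image3.jpg", "image4.jpg"]
parts = []
for path in paths:
    with open(path, "rb") as f:
        parts.append(types.Part.from_bytes(data=f.read(), mime_type="image/jpeg"))

# Ask for per-image descriptions first, then the comparison.
parts.append(
    "First, describe what's in each image in detail. "
    "What's in common between these images? Refer to what's in the images in your response."
)

response = client.models.generate_content(model="gemini-2.5-flash", contents=parts)
print(response.text)
```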
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/34e88eb7-21f3-4f84-af3a-9b10fd3a6cd0 b/docstore/34e88eb7-21f3-4f84-af3a-9b10fd3a6cd0 new file mode 100644 index 0000000000000000000000000000000000000000..3211ac16b1bc5cf788a258a14d07448add8397f9 --- /dev/null +++ b/docstore/34e88eb7-21f3-4f84-af3a-9b10fd3a6cd0 @@ -0,0 +1 @@ +{"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" Large PDFs stored locally Python from google import genai from google.genai import types import pathlib import httpx client = genai . Client () # Retrieve and encode the PDF byte file_path = pathlib . Path ( 'large_file.pdf' ) # Upload the PDF using the File API sample_file = client . files . upload ( file = file_path , ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_file , "Summarize this document" ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const file = await ai . files . upload ({ file : 'path-to-localfile.pdf' config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . 
text ); } main (); Go package main import ( "context" "fmt" "os" \ No newline at end of file diff --git a/docstore/34e89308-78f6-4a57-a86f-85466369fb28 b/docstore/34e89308-78f6-4a57-a86f-85466369fb28 new file mode 100644 index 0000000000000000000000000000000000000000..aee05449de83b11c2592ad958994cdaf02f0141d --- /dev/null +++ b/docstore/34e89308-78f6-4a57-a86f-85466369fb28 @@ -0,0 +1 @@ +done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Turn on the lights please' ; session . sendClientContent ({ turns : inputTurns }); let turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } else if ( turn . toolCall ) { const functionResponses = []; for ( const fc of turn . toolCall . functionCalls ) { functionResponses . push ({ id : fc . id , name : fc . name , response : { result : "ok" } // simple, hard-coded function response }); } console . debug ( 'Sending tool response...\n' ); session . sendToolResponse ({ functionResponses : functionResponses }); } } // Check again for new messages turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); From a single prompt, the model can generate multiple function calls and the code necessary to chain their outputs. This code executes in a sandbox environment, generating subsequent BidiGenerateContentToolCall messages. Asynchronous function calling Note: \ No newline at end of file diff --git a/docstore/34e9ed75-1f4d-4073-88cf-b599e03b1046 b/docstore/34e9ed75-1f4d-4073-88cf-b599e03b1046 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/34e9ed75-1f4d-4073-88cf-b599e03b1046 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
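For the Flash Preview TTS model described above, speech is requested through the regular generateContent call with an audio response modality. A minimal sketch; the voice name and the raw-PCM output handling follow the speech generation guide rather than this excerpt, so treat them as assumptions.

```python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The synthesized speech comes back as raw 24 kHz, 16-bit mono PCM.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)
```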
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/3546c529-775f-41de-89db-758417064756 b/docstore/3546c529-775f-41de-89db-758417064756 new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/3546c529-775f-41de-89db-758417064756 @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . 
log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . \ No newline at end of file diff --git a/docstore/3554b323-94b9-4836-8723-d4499f34ca05 b/docstore/3554b323-94b9-4836-8723-d4499f34ca05 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/3554b323-94b9-4836-8723-d4499f34ca05 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/3559a505-804e-4405-8f5a-bf7e81c29992 b/docstore/3559a505-804e-4405-8f5a-bf7e81c29992 new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/3559a505-804e-4405-8f5a-bf7e81c29992 @@ -0,0 +1 @@ +a picture of me. Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . 
InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me. Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/355fdd75-8be0-4207-8b7e-bc717c02c712 b/docstore/355fdd75-8be0-4207-8b7e-bc717c02c712 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/355fdd75-8be0-4207-8b7e-bc717c02c712 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/3560e65b-f46c-4a8a-9851-15544af59361 b/docstore/3560e65b-f46c-4a8a-9851-15544af59361 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/3560e65b-f46c-4a8a-9851-15544af59361 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. 
Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/356293bc-df24-4440-b0c9-0ad72fa50760 b/docstore/356293bc-df24-4440-b0c9-0ad72fa50760 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/356293bc-df24-4440-b0c9-0ad72fa50760 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/356db14a-78cb-4a50-8f64-2c7917ed075b b/docstore/356db14a-78cb-4a50-8f64-2c7917ed075b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/356db14a-78cb-4a50-8f64-2c7917ed075b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/359a72d7-0f93-46fa-8b0a-cf04867b8e7d b/docstore/359a72d7-0f93-46fa-8b0a-cf04867b8e7d new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/359a72d7-0f93-46fa-8b0a-cf04867b8e7d @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . 
name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/35ac759d-af48-4c96-874d-db7541103405 b/docstore/35ac759d-af48-4c96-874d-db7541103405 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/35ac759d-af48-4c96-874d-db7541103405 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/35f2fa14-20c7-4b0b-b188-ff2050df7df4 b/docstore/35f2fa14-20c7-4b0b-b188-ff2050df7df4 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/35f2fa14-20c7-4b0b-b188-ff2050df7df4 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. 
""" return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/364c889a-2abc-457d-9196-2233d2d11042 b/docstore/364c889a-2abc-457d-9196-2233d2d11042 new file mode 100644 index 0000000000000000000000000000000000000000..3ecdc47bdfdb376a1d8226f76a3e20fc1fff4015 --- /dev/null +++ b/docstore/364c889a-2abc-457d-9196-2233d2d11042 @@ -0,0 +1 @@ +"createTunedModel" : print ( m . name ) break # create tuning model training_dataset = types . TuningDataset ( examples = [ types . TuningExample ( text_input = f 'input { i } ' , output = f 'output { i } ' , ) for i in range ( 5 ) ], ) tuning_job = client . tunings . tune ( base_model = 'models/gemini-1.5-flash-001-tuning' , training_dataset = training_dataset , config = types . CreateTuningJobConfig ( epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , tuned_model_display_name = "test tuned model" ) ) # generate content with the tuned model response = client . models . generate_content ( model = tuning_job . tuned_model . model , contents = '55' , ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/365b3214-8d60-491d-872a-f9cb23afe8d2 b/docstore/365b3214-8d60-491d-872a-f9cb23afe8d2 new file mode 100644 index 0000000000000000000000000000000000000000..1d2463b6c11af951d3bab4a46bb6e7601785f7d8 --- /dev/null +++ b/docstore/365b3214-8d60-491d-872a-f9cb23afe8d2 @@ -0,0 +1 @@ +'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Meet the models Use Gemini in Google AI Studio 2.5 Pro spark Our most powerful thinking model with features for complex reasoning and much more 2.5 Flash spark Our newest multimodal model, with next generation features and improved capabilities 2.5 Flash-Lite spark Our fastest and most cost-efficient multimodal model with great performance for high-frequency tasks Explore the API Native Image Generation Generate and edit highly contextual images natively with Gemini 2.0 Flash. 
Explore long context Input millions of tokens to Gemini models and derive understanding from unstructured images, videos, and documents. Generate structured outputs Constrain Gemini to respond with JSON, a structured data format suitable for automated processing. Start building with the Gemini API Get started Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/36b030c6-80c9-4037-bfa5-686d0de5320d b/docstore/36b030c6-80c9-4037-bfa5-686d0de5320d new file mode 100644 index 0000000000000000000000000000000000000000..4b6418baecebd23eec6598a4eb723dc1516263bd --- /dev/null +++ b/docstore/36b030c6-80c9-4037-bfa5-686d0de5320d @@ -0,0 +1 @@ +default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. \ No newline at end of file diff --git a/docstore/36bae5a1-186c-4c81-8ac2-db6bb8ef1025 b/docstore/36bae5a1-186c-4c81-8ac2-db6bb8ef1025 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/36bae5a1-186c-4c81-8ac2-db6bb8ef1025 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. 
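A quick arithmetic sketch of the video token calculation covered in the video technical details above (1 frame sampled per second, 258 tokens per frame at default media resolution or 66 at low resolution, plus 32 audio tokens per second; metadata adds a little on top). The helper name below is illustrative only, not part of any SDK.

def estimate_video_tokens(duration_seconds: int, low_resolution: bool = False) -> int:
    # Per-second rates taken from the token calculation notes above.
    frame_tokens = 66 if low_resolution else 258
    audio_tokens = 32
    return duration_seconds * (frame_tokens + audio_tokens)

# A 90-second clip: ~26,100 tokens at default resolution (~290/s, close to the
# quoted ~300 tokens per second once metadata is included) and ~8,820 tokens
# at low resolution (~98/s, close to the quoted ~100 tokens per second).
print(estimate_video_tokens(90))        # 26100
print(estimate_video_tokens(90, True))  # 8820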
Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/36c527e9-38c8-4375-a857-83b7b72f6cb0 b/docstore/36c527e9-38c8-4375-a857-83b7b72f6cb0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/36c527e9-38c8-4375-a857-83b7b72f6cb0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
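As a rough illustration of how these native audio dialog models are reached through the Live API, here is a minimal Python sketch. It reuses the connect pattern that appears later in this docstore (client.aio.live.connect with an AUDIO response modality); the model code is taken from the details table that follows, and audio playback is deliberately left as a stub rather than a working implementation.

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
# Model code from the table below; the "thinking" variant could be used instead.
model = "gemini-2.5-flash-preview-native-audio-dialog"

async def main():
    config = types.LiveConnectConfig(response_modalities=["AUDIO"])
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello!")])
        )
        async for message in session.receive():
            # Audio arrives in the server messages; playback/storage is omitted
            # here to keep the sketch short.
            pass

asyncio.run(main())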
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/36cdf045-c110-4782-95e8-a8fbd2922df7 b/docstore/36cdf045-c110-4782-95e8-a8fbd2922df7 new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/36cdf045-c110-4782-95e8-a8fbd2922df7 @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. Avoid exceeding three phrases for cleaner compositions. Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/36def066-b322-419f-a4e7-7a9f03bd8331 b/docstore/36def066-b322-419f-a4e7-7a9f03bd8331 new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/36def066-b322-419f-a4e7-7a9f03bd8331 @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . 
Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/36ed32f1-1501-4104-9d30-38dda9596e33 b/docstore/36ed32f1-1501-4104-9d30-38dda9596e33 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/36ed32f1-1501-4104-9d30-38dda9596e33 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? 
Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/36fd2dc9-0d28-4b8c-aa4a-a6d6b612d6ed b/docstore/36fd2dc9-0d28-4b8c-aa4a-a6d6b612d6ed new file mode 100644 index 0000000000000000000000000000000000000000..aa09fa8779a782eb0f4519da995c2b766869468f --- /dev/null +++ b/docstore/36fd2dc9-0d28-4b8c-aa4a-a6d6b612d6ed @@ -0,0 +1 @@ +genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Refer to timestamps You can refer to specific sections of an audio file using timestamps of the form MM:SS . For example, the following prompt requests a transcript that Starts at 2 minutes 30 seconds from the beginning of the file. Ends at 3 minutes 29 seconds from the beginning of the file. Python # Create a prompt containing timestamps. prompt = "Provide a transcript of the speech from 02:30 to 03:29." JavaScript // Create a prompt containing timestamps. const prompt = "Provide a transcript of the speech from 02:30 to 03:29." Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Provide a transcript of the speech " + "between the timestamps 02:30 and 03:29." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Count tokens Call the countTokens method to get a count of the number of tokens in an audio file. For example: Python response = client . models . 
count_tokens ( model = 'gemini-2.5-flash' , contents = [ myfile ] ) print ( response ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai \ No newline at end of file diff --git a/docstore/36fd6c98-ff14-4495-b758-659b36c78fc1 b/docstore/36fd6c98-ff14-4495-b758-659b36c78fc1 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/36fd6c98-ff14-4495-b758-659b36c78fc1 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/36fe941d-fd12-408a-81cb-75792d4973d2 b/docstore/36fe941d-fd12-408a-81cb-75792d4973d2 new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/36fe941d-fd12-408a-81cb-75792d4973d2 @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . 
send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/37188171-7386-474c-b927-90bd37758b2c b/docstore/37188171-7386-474c-b927-90bd37758b2c new file mode 100644 index 0000000000000000000000000000000000000000..41c5d7c70c10b0c099f849b39a650a62d6333896 --- /dev/null +++ b/docstore/37188171-7386-474c-b927-90bd37758b2c @@ -0,0 +1 @@ +npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. 
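The JavaScript example above wraps the MCP client with mcpToTool; on the Python side, the SDK's experimental MCP support takes an mcp ClientSession directly in tools. The following sketch assumes the same @philschmid/weather-mcp stdio server as the JavaScript snippet and the current behavior of the google-genai Python SDK's experimental MCP integration; treat it as an outline rather than a definitive recipe.

import asyncio
from google import genai
from google.genai import types
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Same stdio MCP server as in the JavaScript example.
server_params = StdioServerParameters(
    command="npx", args=["-y", "@philschmid/weather-mcp"]
)

async def main():
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            client = genai.Client()
            # Passing the session in tools lets the SDK discover and call the
            # MCP server's tools automatically during generation.
            response = await client.aio.models.generate_content(
                model="gemini-2.5-flash",
                contents="What is the weather in London today?",
                config=types.GenerateContentConfig(tools=[session]),
            )
            print(response.text)

asyncio.run(main())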
Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. \ No newline at end of file diff --git a/docstore/372401e9-f0b2-4482-9ca4-051565c1c62b b/docstore/372401e9-f0b2-4482-9ca4-051565c1c62b new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/372401e9-f0b2-4482-9ca4-051565c1c62b @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/372e088e-33c3-4705-ae62-d4d38ac02b55 b/docstore/372e088e-33c3-4705-ae62-d4d38ac02b55 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/372e088e-33c3-4705-ae62-d4d38ac02b55 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . 
For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/3734203d-2753-4b97-8a6e-ceb3da37f370 b/docstore/3734203d-2753-4b97-8a6e-ceb3da37f370 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/3734203d-2753-4b97-8a6e-ceb3da37f370 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/37548d21-5dfe-4016-b872-1b773824c3e8 b/docstore/37548d21-5dfe-4016-b872-1b773824c3e8 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/37548d21-5dfe-4016-b872-1b773824c3e8 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. 
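To check that rule of thumb (roughly 4 characters, or 0.6 to 0.8 English words, per token) against a concrete prompt, the countTokens method used elsewhere in these docs for media files also accepts plain text. A minimal Python sketch:

from google import genai

client = genai.Client()
response = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents="The quick brown fox jumps over the lazy dog.",
)
# total_tokens reports how many input tokens this prompt would consume.
print(response.total_tokens)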
Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/375e0476-2348-4840-994b-b84cf38bda62 b/docstore/375e0476-2348-4840-994b-b84cf38bda62 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/375e0476-2348-4840-994b-b84cf38bda62 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/3761c7be-d372-4e72-83d4-4482024120c7 b/docstore/3761c7be-d372-4e72-83d4-4482024120c7 new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5762f2e47222d475421a2613ce0f732fa260 --- /dev/null +++ b/docstore/3761c7be-d372-4e72-83d4-4482024120c7 @@ -0,0 +1 @@ +in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at \ No newline at end of file diff --git a/docstore/3780d839-243a-46d0-9cc2-2cbd5126ad82 b/docstore/3780d839-243a-46d0-9cc2-2cbd5126ad82 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/3780d839-243a-46d0-9cc2-2cbd5126ad82 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/3782e6c7-86d6-4822-bf77-f31a8efbbb0d b/docstore/3782e6c7-86d6-4822-bf77-f31a8efbbb0d new file mode 100644 index 0000000000000000000000000000000000000000..1d2463b6c11af951d3bab4a46bb6e7601785f7d8 --- /dev/null +++ b/docstore/3782e6c7-86d6-4822-bf77-f31a8efbbb0d @@ -0,0 +1 @@ +'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Meet the models Use Gemini in Google AI Studio 2.5 Pro spark Our most powerful thinking model with features for complex reasoning and much more 2.5 Flash spark Our newest multimodal model, with next generation features and improved capabilities 2.5 Flash-Lite spark Our fastest and most cost-efficient multimodal model with great performance for high-frequency tasks Explore the API Native Image Generation Generate and edit highly contextual images natively with Gemini 2.0 Flash. Explore long context Input millions of tokens to Gemini models and derive understanding from unstructured images, videos, and documents. Generate structured outputs Constrain Gemini to respond with JSON, a structured data format suitable for automated processing. Start building with the Gemini API Get started Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
\ No newline at end of file diff --git a/docstore/37b52ffd-7f9d-4b80-88ad-a055a1abe0f0 b/docstore/37b52ffd-7f9d-4b80-88ad-a055a1abe0f0 new file mode 100644 index 0000000000000000000000000000000000000000..40564cc3a339b41e3f9c5a2f24a7d0082d31abf9 --- /dev/null +++ b/docstore/37b52ffd-7f9d-4b80-88ad-a055a1abe0f0 @@ -0,0 +1 @@ +response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # \ No newline at end of file diff --git a/docstore/37c5d513-146f-4676-ad19-2f3e790cdaa5 b/docstore/37c5d513-146f-4676-ad19-2f3e790cdaa5 new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/37c5d513-146f-4676-ad19-2f3e790cdaa5 @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . 
name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/37cf113e-ac06-4f8a-b24f-cfec69770d2f b/docstore/37cf113e-ac06-4f8a-b24f-cfec69770d2f new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/37cf113e-ac06-4f8a-b24f-cfec69770d2f @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/37d67931-8d37-40da-89e1-54b7e0cbcce6 b/docstore/37d67931-8d37-40da-89e1-54b7e0cbcce6 new file mode 100644 index 0000000000000000000000000000000000000000..6b1a985d58ba09ec62238bf4586b9d6d54544db9 --- /dev/null +++ b/docstore/37d67931-8d37-40da-89e1-54b7e0cbcce6 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live#implementation-approach Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/37ecb4ea-76d7-4b78-ab3e-d6be26387d3e b/docstore/37ecb4ea-76d7-4b78-ab3e-d6be26387d3e new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/37ecb4ea-76d7-4b78-ab3e-d6be26387d3e @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). 
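To make that kind of updated prompt concrete, here is a minimal Python sketch that sends the more specific instructions together with an inline image, reusing the inline image pattern from the image understanding page earlier in this docstore; the file path is a placeholder, not a real sample asset.

from google import genai
from google.genai import types

client = genai.Client()
with open("path/to/diaper-box.jpg", "rb") as f:  # placeholder image path
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "How long will these diapers last before I run out? Use the weight "
        "shown on the box to determine the child's age, and use the total "
        "number of diapers in the box. Divide the total number by how many "
        "diapers the child goes through per day.",
    ],
)
print(response.text)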
If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/380fd21b-67cd-4184-8c29-435d50a9e396 b/docstore/380fd21b-67cd-4184-8c29-435d50a9e396 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/380fd21b-67cd-4184-8c29-435d50a9e396 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/383d30dd-92e9-4865-abb8-ea6cbe801834 b/docstore/383d30dd-92e9-4865-abb8-ea6cbe801834 new file mode 100644 index 0000000000000000000000000000000000000000..b7772256110e1184cd9828af16aaf33fa099feda --- /dev/null +++ b/docstore/383d30dd-92e9-4865-abb8-ea6cbe801834 @@ -0,0 +1 @@ +enlisting people in 'red teams' to try and break your application. In automated testing, the 'red team' is another language model that finds input text that elicit harmful outputs from the model being tested. Note: LLMs are known to sometimes produce different outputs for the same input prompt. Multiple rounds of testing may be needed to catch more of the problematic outputs. Monitor for problems No matter how much you test and mitigate, you can never guarantee perfection, so plan upfront how you'll spot and deal with problems that arise. 
Common approaches include setting up a monitored channel for users to share feedback (e.g., thumbs up/down rating) and running a user study to proactively solicit feedback from a diverse mix of users — especially valuable if usage patterns are different to expectations. Advanced tips When users give feedback to AI products, it can greatly improve the AI performance and the user experience over time by, for example, helping you choose better examples for prompt tuning. The Feedback and Control chapter in Google's People and AI guidebook highlights key considerations to take into account when designing feedback mechanisms. Next steps Refer to the safety settings guide to learn about the adjustable safety settings available through the Gemini API. See the intro to prompting to get started writing your first prompts. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-02-25 UTC. \ No newline at end of file diff --git a/docstore/3859d406-93e6-4011-b472-fa56a8596cff b/docstore/3859d406-93e6-4011-b472-fa56a8596cff new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/3859d406-93e6-4011-b472-fa56a8596cff @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/389d8d3c-7c69-46bd-a8fa-50a36667e383 b/docstore/389d8d3c-7c69-46bd-a8fa-50a36667e383 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/389d8d3c-7c69-46bd-a8fa-50a36667e383 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/38a01015-87da-4897-948d-b8804862e03e b/docstore/38a01015-87da-4897-948d-b8804862e03e new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/38a01015-87da-4897-948d-b8804862e03e @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . 
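The REST and Apps Script calls above disable thinking by setting thinkingBudget to 0, while the Python and JavaScript snippets that follow them show system instructions. As a rough Python sketch (not taken verbatim from the page), both settings can be combined in a single GenerateContentConfig; the budget value and the cat persona simply reuse the examples above.

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    config=types.GenerateContentConfig(
        system_instruction="You are a cat. Your name is Neko.",
        # A budget of 0 turns thinking off on models that allow it.
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    ),
)
print(response.text)
```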
Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/38ab2b7d-4f10-4235-9165-ccee745e8ebd b/docstore/38ab2b7d-4f10-4235-9165-ccee745e8ebd new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/38ab2b7d-4f10-4235-9165-ccee745e8ebd @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/38b0d9d2-2c8e-432f-b0f0-d033c0d5fea0 b/docstore/38b0d9d2-2c8e-432f-b0f0-d033c0d5fea0 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/38b0d9d2-2c8e-432f-b0f0-d033c0d5fea0 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
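As an aside to the model-variant overview above, the same catalogue can be queried at runtime. A short Python sketch that lists the models exposed to your API key (the printed fields are an assumption about what is most useful to show):

```python
from google import genai

client = genai.Client()

# Enumerate the model variants available to this API key.
for model in client.models.list():
    print(model.name, "-", model.display_name)
```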
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/38b2690f-6f2e-4905-8e0a-b514f7d97114 b/docstore/38b2690f-6f2e-4905-8e0a-b514f7d97114 new file mode 100644 index 0000000000000000000000000000000000000000..b1044b06e974ef70df5275060bd78c27b49af935 --- /dev/null +++ b/docstore/38b2690f-6f2e-4905-8e0a-b514f7d97114 @@ -0,0 +1 @@ +ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. 
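Before moving on to JSON Schema support, it may help to see the Pydantic approach described above as one self-contained sketch. The Recipe fields here are illustrative, and response.parsed is used on the assumption that the SDK materializes the declared type from the JSON output.

```python
from pydantic import BaseModel
from google import genai

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes with their ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)

print(response.text)       # raw JSON string
recipes = response.parsed  # list[Recipe] built by the SDK
```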
Support for JSON Schema is available as a preview using the \ No newline at end of file diff --git a/docstore/38cc542d-244b-4b29-a5a6-9115a6a848cf b/docstore/38cc542d-244b-4b29-a5a6-9115a6a848cf new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/38cc542d-244b-4b29-a5a6-9115a6a848cf @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a the 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/38d1dc87-f6ae-4904-ae5a-ee0095889b59 b/docstore/38d1dc87-f6ae-4904-ae5a-ee0095889b59 new file mode 100644 index 0000000000000000000000000000000000000000..6556f69284c91652e57b9c1d4b105e3f4aa02c59 --- /dev/null +++ b/docstore/38d1dc87-f6ae-4904-ae5a-ee0095889b59 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#live-api-2.0 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/38eb1125-7a42-4375-b142-99512b739028 b/docstore/38eb1125-7a42-4375-b142-99512b739028 new file mode 100644 index 0000000000000000000000000000000000000000..40564cc3a339b41e3f9c5a2f24a7d0082d31abf9 --- /dev/null +++ b/docstore/38eb1125-7a42-4375-b142-99512b739028 @@ -0,0 +1 @@ +response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . 
AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # \ No newline at end of file diff --git a/docstore/38eb6a48-85d4-4de3-847c-f16147731b07 b/docstore/38eb6a48-85d4-4de3-847c-f16147731b07 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/38eb6a48-85d4-4de3-847c-f16147731b07 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/38fd69a1-4aa8-4eba-a9c1-0aef2d7935e1 b/docstore/38fd69a1-4aa8-4eba-a9c1-0aef2d7935e1 new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/38fd69a1-4aa8-4eba-a9c1-0aef2d7935e1 @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/38ff7325-8ce8-4670-8a7c-23b050363ac2 b/docstore/38ff7325-8ce8-4670-8a7c-23b050363ac2 new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/38ff7325-8ce8-4670-8a7c-23b050363ac2 @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... 
Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. "generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/3905cb26-000f-46f4-82ee-38c701723bdf b/docstore/3905cb26-000f-46f4-82ee-38c701723bdf new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/3905cb26-000f-46f4-82ee-38c701723bdf @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. 
For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. Start by \ No newline at end of file diff --git a/docstore/392257b3-9a6c-4177-ae49-5ba1eae26583 b/docstore/392257b3-9a6c-4177-ae49-5ba1eae26583 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/392257b3-9a6c-4177-ae49-5ba1eae26583 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/39339bc1-8923-4d3c-8764-203d5d4b2c24 b/docstore/39339bc1-8923-4d3c-8764-203d5d4b2c24 new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/39339bc1-8923-4d3c-8764-203d5d4b2c24 @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. 
Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/3945ee50-3dd3-4bec-afa5-7998902f278c b/docstore/3945ee50-3dd3-4bec-afa5-7998902f278c new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/3945ee50-3dd3-4bec-afa5-7998902f278c @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/3949fdb5-e6a7-425f-8d80-2fa9f41391be b/docstore/3949fdb5-e6a7-425f-8d80-2fa9f41391be new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/3949fdb5-e6a7-425f-8d80-2fa9f41391be @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. 
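To round out the upload and delete calls shown just above, here is a small Python sketch covering the rest of the lifecycle the passage describes, fetching metadata and listing stored files before the 48-hour expiry. The sample path is a placeholder.

```python
from google import genai

client = genai.Client()

# Upload a media file (stored for 48 hours, up to 2 GB per file).
myfile = client.files.upload(file="path/to/sample.mp3")

# Retrieve metadata for the uploaded file (the file itself can't be downloaded).
info = client.files.get(name=myfile.name)
print(info.name, info.mime_type)

# List the files currently stored for this project (20 GB total limit).
for f in client.files.list():
    print(f.name)

# Delete the file before the automatic expiry.
client.files.delete(name=myfile.name)
```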
The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/395a80db-79bb-4cf1-b953-5b8f78dd0dc5 b/docstore/395a80db-79bb-4cf1-b953-5b8f78dd0dc5 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/395a80db-79bb-4cf1-b953-5b8f78dd0dc5 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3992d071-d68c-4665-bd8a-7b0dabfcc940 b/docstore/3992d071-d68c-4665-bd8a-7b0dabfcc940 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/3992d071-d68c-4665-bd8a-7b0dabfcc940 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to 
main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/3994474f-81a0-4f2d-bacd-0ba84d501328 b/docstore/3994474f-81a0-4f2d-bacd-0ba84d501328 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/3994474f-81a0-4f2d-bacd-0ba84d501328 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" 
using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/39959db5-0b6c-4eba-a8da-2fff5a239581 b/docstore/39959db5-0b6c-4eba-a8da-2fff5a239581 new file mode 100644 index 0000000000000000000000000000000000000000..10c56dda4e771cbe191acbd7eaea4d6ff44484f5 --- /dev/null +++ b/docstore/39959db5-0b6c-4eba-a8da-2fff5a239581 @@ -0,0 +1 @@ +which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. 
message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", \ No newline at end of file diff --git a/docstore/39a67908-556c-43d7-83c8-5f2692c2676d b/docstore/39a67908-556c-43d7-83c8-5f2692c2676d new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/39a67908-556c-43d7-83c8-5f2692c2676d @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/39bd5682-5777-4027-a431-9c3cf3452413 b/docstore/39bd5682-5777-4027-a431-9c3cf3452413 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/39bd5682-5777-4027-a431-9c3cf3452413 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/39e5a5e4-cbf2-4e3a-bb9f-f234f631d5b1 b/docstore/39e5a5e4-cbf2-4e3a-bb9f-f234f631d5b1 new file mode 100644 index 0000000000000000000000000000000000000000..b9aae1f02a8caa7a25135d3bec800921c05dfc11 --- /dev/null +++ b/docstore/39e5a5e4-cbf2-4e3a-bb9f-f234f631d5b1 @@ -0,0 +1 @@ +( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. 
bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on \ No newline at end of file diff --git a/docstore/39e5bcc2-2558-4bc0-bebf-2da2ea8f962b b/docstore/39e5bcc2-2558-4bc0-bebf-2da2ea8f962b new file mode 100644 index 0000000000000000000000000000000000000000..ddc1ec68807ed0017d00c5153db6b826d6e2aced --- /dev/null +++ b/docstore/39e5bcc2-2558-4bc0-bebf-2da2ea8f962b @@ -0,0 +1 @@ +"GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC. \ No newline at end of file diff --git a/docstore/39eedc29-153e-459f-aeb7-f9d7cf3bec44 b/docstore/39eedc29-153e-459f-aeb7-f9d7cf3bec44 new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/39eedc29-153e-459f-aeb7-f9d7cf3bec44 @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
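The JavaScript and curl versions of the audio-transcription request appear in full above, while the Python version is cut off at its final print statement. A reconstructed Python sketch of the same call, with the same model and input_audio payload shape, follows; the file path and API key are placeholders.

```python
import base64
from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

with open("/path/to/your/audio/file.wav", "rb") as f:
    base64_audio = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Transcribe this audio"},
                {
                    "type": "input_audio",
                    "input_audio": {"data": base64_audio, "format": "wav"},
                },
            ],
        }
    ],
)
print(response.choices[0].message.content)
```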
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/39f11906-7c64-463e-b8d9-806be66fdab6 b/docstore/39f11906-7c64-463e-b8d9-806be66fdab6 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/39f11906-7c64-463e-b8d9-806be66fdab6 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/39fe74d3-dddb-4509-9a3c-789ade3ec83c b/docstore/39fe74d3-dddb-4509-9a3c-789ade3ec83c new file mode 100644 index 0000000000000000000000000000000000000000..01c5ad2134e900af7bebbd1821490393529927e8 --- /dev/null +++ b/docstore/39fe74d3-dddb-4509-9a3c-789ade3ec83c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/billing#request-an-upgrade Title: Billing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/3a0a299e-9339-4263-9ddf-2dd2424cd943 b/docstore/3a0a299e-9339-4263-9ddf-2dd2424cd943 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/3a0a299e-9339-4263-9ddf-2dd2424cd943 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/3a63fd14-74ba-4d25-9e63-7ce00acd996a b/docstore/3a63fd14-74ba-4d25-9e63-7ce00acd996a new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/3a63fd14-74ba-4d25-9e63-7ce00acd996a @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/3a714e7d-0b21-4050-8da6-34f227d84701 b/docstore/3a714e7d-0b21-4050-8da6-34f227d84701 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/3a714e7d-0b21-4050-8da6-34f227d84701 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. 
Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/3a883e58-4ea2-4801-b8ff-4f5d90b3ffde b/docstore/3a883e58-4ea2-4801-b8ff-4f5d90b3ffde new file mode 100644 index 0000000000000000000000000000000000000000..b7901babd54dbafa9f145cdbd26424334c4f88eb --- /dev/null +++ b/docstore/3a883e58-4ea2-4801-b8ff-4f5d90b3ffde @@ -0,0 +1 @@ +upload ( file = doc_data_2 , config = dict ( mime_type = 'application/pdf' ) ) prompt = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_pdf_1 , sample_pdf_2 , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function uploadRemotePDF ( url , displayName ) { const pdfBuffer = await fetch ( url ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : displayName , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } return file ; } async function main () { const content = [ 'What is the difference between each of the main benchmarks between these two papers? Output these in a table.' , ]; let file1 = await uploadRemotePDF ( "https://arxiv.org/pdf/2312.11805" , "PDF 1" ) if ( file1 . uri && file1 . mimeType ) { const fileContent = createPartFromUri ( file1 . 
uri , file1 . mimeType ); content . push ( fileContent ); } let file2 = await uploadRemotePDF ( "https://arxiv.org/pdf/2403.05530" , "PDF 2" ) if ( file2 . uri && file2 . mimeType ) { const fileContent = createPartFromUri ( file2 . uri , file2 . mimeType ); content . push ( fileContent ); } const response = await ai . models . \ No newline at end of file diff --git a/docstore/3a95dfd1-1323-42c2-a4bc-0fa1ff2249cf b/docstore/3a95dfd1-1323-42c2-a4bc-0fa1ff2249cf new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/3a95dfd1-1323-42c2-a4bc-0fa1ff2249cf @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/3aa243e4-bd73-412b-989a-780bcbd6c01d b/docstore/3aa243e4-bd73-412b-989a-780bcbd6c01d new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/3aa243e4-bd73-412b-989a-780bcbd6c01d @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3aa47beb-75af-491e-bbe5-15d6b896a9db b/docstore/3aa47beb-75af-491e-bbe5-15d6b896a9db new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/3aa47beb-75af-491e-bbe5-15d6b896a9db @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' 
} }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/3ab347da-1338-491f-8f7a-d8f682df6b6d b/docstore/3ab347da-1338-491f-8f7a-d8f682df6b6d new file mode 100644 index 0000000000000000000000000000000000000000..4635be3508ca0afda1003e879ec15abc83b5d7a7 --- /dev/null +++ b/docstore/3ab347da-1338-491f-8f7a-d8f682df6b6d @@ -0,0 +1 @@ +NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), // Specify role } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST PDF_PATH = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" DISPLAY_NAME = "A17_FlightPlan" PROMPT = "Summarize this document" # Download the PDF from the provided URL wget -O " ${ DISPLAY_NAME } .pdf" " ${ PDF_PATH } " MIME_TYPE = $( file -b --mime-type " ${ DISPLAY_NAME } .pdf" ) NUM_BYTES = $( wc -c < " ${ DISPLAY_NAME } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ DISPLAY_NAME } .pdf" 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo "file_uri: ${ file_uri } " # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "' $PROMPT '"}, \ No newline at end of file diff --git a/docstore/3ae3a1f5-f54b-438c-b586-c68aea9161a2 b/docstore/3ae3a1f5-f54b-438c-b586-c68aea9161a2 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/3ae3a1f5-f54b-438c-b586-c68aea9161a2 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. 
Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/3b051373-30da-4cec-a8e1-a5618629a9a7 b/docstore/3b051373-30da-4cec-a8e1-a5618629a9a7 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/3b051373-30da-4cec-a8e1-a5618629a9a7 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . 
content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/3b0d025e-cce1-47ea-84d9-7c9a7680d9c2 b/docstore/3b0d025e-cce1-47ea-84d9-7c9a7680d9c2 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/3b0d025e-cce1-47ea-84d9-7c9a7680d9c2 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/3b19cca0-41fd-44e0-a2b3-7401c53dcf10 b/docstore/3b19cca0-41fd-44e0-a2b3-7401c53dcf10 new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/3b19cca0-41fd-44e0-a2b3-7401c53dcf10 @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . 
LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/3b1f69ff-5f79-4234-9327-45322463336f b/docstore/3b1f69ff-5f79-4234-9327-45322463336f new file mode 100644 index 0000000000000000000000000000000000000000..6a8a77c3ec0b2f12317f225d20ed3ea5b03e9f67 --- /dev/null +++ b/docstore/3b1f69ff-5f79-4234-9327-45322463336f @@ -0,0 +1 @@ +JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Write a story about a magic backpack." ; const result = await model . generateContentStream ( prompt ); // Print text as it comes in. for await ( const chunk of result . stream ) { const chunkText = chunk . text (); process . stdout . write ( chunkText ); } Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) iter := model . GenerateContentStream ( ctx , genai . Text ( "Write a story about a magic backpack." )) for { resp , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing the response } After Python from google import genai client = genai . Client () for chunk in client . models . generate_content_stream ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ): print ( chunk . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContentStream ({ model : "gemini-2.0-flash" , contents : "Write a story about a magic backpack." , }); let text = "" ; for await ( const chunk of response ) { console . log ( chunk . text ); text += chunk . text ; } Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } for result , err := range client . Models . 
GenerateContentStream ( ctx , "gemini-2.0-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) { if err != nil { log . Fatal ( err ) } fmt . Print ( result . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } Configuration \ No newline at end of file diff --git a/docstore/3b229c56-2c70-47af-a73e-630aa52ab931 b/docstore/3b229c56-2c70-47af-a73e-630aa52ab931 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/3b229c56-2c70-47af-a73e-630aa52ab931 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/3b78f4cc-f99b-4592-be5a-8efa07420b90 b/docstore/3b78f4cc-f99b-4592-be5a-8efa07420b90 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/3b78f4cc-f99b-4592-be5a-8efa07420b90 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . 
Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/3b885180-0878-4bc4-8579-e903bba4e2a9 b/docstore/3b885180-0878-4bc4-8579-e903bba4e2a9 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/3b885180-0878-4bc4-8579-e903bba4e2a9 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/3babeed1-0a1e-4dc2-8e11-87381e7a949f b/docstore/3babeed1-0a1e-4dc2-8e11-87381e7a949f new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/3babeed1-0a1e-4dc2-8e11-87381e7a949f @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/3bb4f28b-b244-4f00-80c3-32c9b966f6cb b/docstore/3bb4f28b-b244-4f00-80c3-32c9b966f6cb new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/3bb4f28b-b244-4f00-80c3-32c9b966f6cb @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. 
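For reference, the same create-and-name flow can be sketched with the Python SDK. This is a minimal sketch, assuming the google-genai client exposes client.batches.create with a src argument that takes the uploaded input file's resource name and a config dict carrying display_name, and assuming a hypothetical local JSONL file of requests; check the SDK reference for the exact signature.

from google import genai

client = genai.Client()

# Hypothetical local JSONL file holding one GenerateContent request per line.
batch_input_file = client.files.upload(file="my-batch-requests.jsonl")

batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=batch_input_file.name,                     # reference to the uploaded input file (assumed parameter)
    config={"display_name": "my-batch-requests"},  # mirrors the display_name used in the REST call above
)

# The returned resource name (e.g. "batches/...") is what you poll for status later.
print(f"Created batch job from file: {batch_job.name}")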
The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/3becc5e6-2b2b-4c13-ad12-3c9e2064baed b/docstore/3becc5e6-2b2b-4c13-ad12-3c9e2064baed new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/3becc5e6-2b2b-4c13-ad12-3c9e2064baed @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/3bf1ec4b-4978-4a68-a936-e96b6595d8d8 b/docstore/3bf1ec4b-4978-4a68-a936-e96b6595d8d8 new file mode 100644 index 0000000000000000000000000000000000000000..c7d99b48acdb29ebe1cdd75df52d7215dd4d0ab1 --- /dev/null +++ b/docstore/3bf1ec4b-4978-4a68-a936-e96b6595d8d8 @@ -0,0 +1 @@ +- Zsh Zsh is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.zshrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use bash : touch ~/.zshrc open ~/.zshrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.zshrc Windows Search for "Environment Variables" in the system settings Edit either "User variables" (for current user) or "System variables" (for all users - use with caution). Create the variable and add export GEMINI_API_KEY=your_key_here Apply the changes Providing API key explicitly In some cases, you may want to explicitly provide an API key. For example: You're doing a simple API call and prefer hard coding the API key. You want explicit control without having to rely on automatic discovery of environment variables by the Gemini API libraries You're using an environment where environment variables are not supported (e.g web) or you are making REST calls. Below are examples for how you can provide an API key explicitly: Python from google import genai client = genai . Client ( api_key = " YOUR_API_KEY " ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : " YOUR_API_KEY " }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . \ No newline at end of file diff --git a/docstore/3c0ebbaf-eb3f-45ed-88ca-95763fc91c6a b/docstore/3c0ebbaf-eb3f-45ed-88ca-95763fc91c6a new file mode 100644 index 0000000000000000000000000000000000000000..29805a750d326aab08740367fc13678c1846ec09 --- /dev/null +++ b/docstore/3c0ebbaf-eb3f-45ed-88ca-95763fc91c6a @@ -0,0 +1 @@ +default. Here, you disable it. from google import genai from google.genai import types client = genai . 
Client () def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ], automatic_function_calling = { 'disable' : True }, ), ) function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call Automatic function calling Before Python The old SDK only supports automatic function calling in chat. In the new SDK this is the default behavior in generate_content . import google.generativeai as genai def get_current_weather ( city : str ) - > str : return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) chat = model . start_chat ( enable_automatic_function_calling = True ) result = chat . send_message ( "What is the weather in San Francisco?" ) After Python from google import genai from google.genai import types client = genai . Client () def get_current_weather ( city : str ) - > str : return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ] ), ) Code execution Code execution is a tool that allows the model to generate Python code, run it, and return the result. Before Python import google.generativeai as genai model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = "code_execution" ) result = model . generate_content ( "What is the sum of the first 50 prime numbers? Generate and run code for " "the calculation, and make sure you \ No newline at end of file diff --git a/docstore/3c19b39d-b38b-46f7-8da3-7487c2d521f7 b/docstore/3c19b39d-b38b-46f7-8da3-7487c2d521f7 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/3c19b39d-b38b-46f7-8da3-7487c2d521f7 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3c3eeae8-022e-4717-a2be-aba48b572f64 b/docstore/3c3eeae8-022e-4717-a2be-aba48b572f64 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/3c3eeae8-022e-4717-a2be-aba48b572f64 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/3c532a29-38ad-429d-81cd-5b8fe607e989 b/docstore/3c532a29-38ad-429d-81cd-5b8fe607e989 new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/3c532a29-38ad-429d-81cd-5b8fe607e989 @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/3c668ba2-7da8-4fc4-bfb1-b97b3774db1c b/docstore/3c668ba2-7da8-4fc4-bfb1-b97b3774db1c new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/3c668ba2-7da8-4fc4-bfb1-b97b3774db1c @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. 
This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/3c688d95-5bd6-4882-9965-6d68a51fe6be b/docstore/3c688d95-5bd6-4882-9965-6d68a51fe6be new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/3c688d95-5bd6-4882-9965-6d68a51fe6be @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/3c7ef2c3-362d-4747-b567-bc62036c3fac b/docstore/3c7ef2c3-362d-4747-b567-bc62036c3fac new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/3c7ef2c3-362d-4747-b567-bc62036c3fac @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/3c7f9f5b-3cf0-4680-865c-03cbfefd9a2f b/docstore/3c7f9f5b-3cf0-4680-865c-03cbfefd9a2f new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/3c7f9f5b-3cf0-4680-865c-03cbfefd9a2f @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/3c8733e7-0c23-4e57-8680-66a68f930ef5 b/docstore/3c8733e7-0c23-4e57-8680-66a68f930ef5 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/3c8733e7-0c23-4e57-8680-66a68f930ef5 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
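The segmentation example above calls a parse_json helper that is not shown in this chunk. A plausible minimal implementation, assuming the model wraps its JSON output in a markdown code fence, might look like the following; treat it as an assumption rather than the exact helper used in the original example.

```python
def parse_json(json_output: str) -> str:
    # Assumed helper: strip an optional ```json ... ``` markdown fence so the
    # remaining string can be passed to json.loads().
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "```json":
            # Keep everything after the opening fence...
            json_output = "\n".join(lines[i + 1:])
            # ...and drop everything from the closing fence onward.
            json_output = json_output.split("```")[0]
            break
    return json_output
```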
generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/3c98946c-5984-40ec-b1df-c00de41e0225 b/docstore/3c98946c-5984-40ec-b1df-c00de41e0225 new file mode 100644 index 0000000000000000000000000000000000000000..ddc1ec68807ed0017d00c5153db6b826d6e2aced --- /dev/null +++ b/docstore/3c98946c-5984-40ec-b1df-c00de41e0225 @@ -0,0 +1 @@ +"GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC. \ No newline at end of file diff --git a/docstore/3c9db5f5-bac3-4360-9dbd-a430e453d4db b/docstore/3c9db5f5-bac3-4360-9dbd-a430e453d4db new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/3c9db5f5-bac3-4360-9dbd-a430e453d4db @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/3ca883a4-30e4-43b6-a969-058812f03c7c b/docstore/3ca883a4-30e4-43b6-a969-058812f03c7c new file mode 100644 index 0000000000000000000000000000000000000000..6e0fa692ae99c1202fd2c5b8466f6a5c4bfd32f9 --- /dev/null +++ b/docstore/3ca883a4-30e4-43b6-a969-058812f03c7c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/3cc654ac-673d-43bd-a5ef-1428b9cb2eb0 b/docstore/3cc654ac-673d-43bd-a5ef-1428b9cb2eb0 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/3cc654ac-673d-43bd-a5ef-1428b9cb2eb0 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/3cf09d83-6840-4b73-94fc-1a3783c55037 b/docstore/3cf09d83-6840-4b73-94fc-1a3783c55037 new file mode 100644 index 0000000000000000000000000000000000000000..505812909ecb21ec1bec9ec74047ffd478d04055 --- /dev/null +++ b/docstore/3cf09d83-6840-4b73-94fc-1a3783c55037 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/3d0174c5-f905-47da-9bfa-f409ee8f2524 b/docstore/3d0174c5-f905-47da-9bfa-f409ee8f2524 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/3d0174c5-f905-47da-9bfa-f409ee8f2524 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. 
It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/3d22e8fc-9cc1-45a9-9f3e-73196487cdf2 b/docstore/3d22e8fc-9cc1-45a9-9f3e-73196487cdf2 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/3d22e8fc-9cc1-45a9-9f3e-73196487cdf2 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/3d51da65-5365-424b-8108-abe5a63fa694 b/docstore/3d51da65-5365-424b-8108-abe5a63fa694 new file mode 100644 index 0000000000000000000000000000000000000000..46dc106c387700742db50f2912cf28b003e737e9 --- /dev/null +++ b/docstore/3d51da65-5365-424b-8108-abe5a63fa694 @@ -0,0 +1 @@ +ClientConfig { APIKey : " YOUR_API_KEY " , Backend : genai . BackendGeminiAPI , }) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . 
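The Python snippet for Step 2 is truncated in this chunk. Below is a minimal sketch of how the configured tools might be passed to generate_content and how a suggested function call could be read back; it assumes set_light_values_declaration holds the declaration from Step 1 (a Python dictionary equivalent to the JavaScript object shown there), and the prompt text is illustrative.

```python
from google import genai
from google.genai import types

client = genai.Client()

# Assumes set_light_values_declaration was defined in Step 1.
tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn the lights down to a romantic level",
    config=config,
)

# If the model decided to call a function, the suggestion comes back as a
# function_call part rather than plain text.
part = response.candidates[0].content.parts[0]
if part.function_call:
    print(part.function_call.name, dict(part.function_call.args))
else:
    print(response.text)
```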
Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = Client . builder (). apiKey ( " YOUR_API_KEY " ). build (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $ YOUR_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Keep your API key secure Treat your Gemini API key like a password. If compromised, others can use your project's quota, incur charges (if billing is enabled), and access your private data, such as files. Critical security rules Never commit API keys to source control. Do not check your API key into version control systems like Git. Never expose API keys on the client-side. Do not use your API key directly in web or mobile apps in production. Keys in client-side code (including our JavaScript/TypeScript libraries and REST calls) can be extracted. Best practices Use server-side calls with API keys The most secure way to use your API key is to call the Gemini API from a server-side application where the key can be kept confidential. Use ephemeral tokens for client-side access (Live API only): For direct client-side access to the Live API, you can use ephemeral tokens. They come with lower security risks and can be \ No newline at end of file diff --git a/docstore/3d52ba65-add2-412e-b41d-ef21e4da04d6 b/docstore/3d52ba65-add2-412e-b41d-ef21e4da04d6 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/3d52ba65-add2-412e-b41d-ef21e4da04d6 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
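As a concrete illustration of the server-side guidance above, here is a minimal sketch that reads the key from an environment variable instead of hardcoding it in source code. GEMINI_API_KEY is the conventional variable name; the surrounding call is an assumption for illustration.

```python
import os
from google import genai

# Read the key from the environment rather than embedding it in source code,
# so it never ends up in version control or in a client-side bundle.
api_key = os.environ["GEMINI_API_KEY"]

client = genai.Client(api_key=api_key)
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)
```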
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3d716c9a-8e24-4d12-aaf0-2f753d0bddb3 b/docstore/3d716c9a-8e24-4d12-aaf0-2f753d0bddb3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/3d716c9a-8e24-4d12-aaf0-2f753d0bddb3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/3d72550c-2bcb-49a9-8519-5b54506af111 b/docstore/3d72550c-2bcb-49a9-8519-5b54506af111 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/3d72550c-2bcb-49a9-8519-5b54506af111 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/3d86d3f9-92f5-42cd-b4ef-905eca413bff b/docstore/3d86d3f9-92f5-42cd-b4ef-905eca413bff new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/3d86d3f9-92f5-42cd-b4ef-905eca413bff @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . 
parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/3d8fd3c8-aaf1-4a5b-8fa6-584b9dc20964 b/docstore/3d8fd3c8-aaf1-4a5b-8fa6-584b9dc20964 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/3d8fd3c8-aaf1-4a5b-8fa6-584b9dc20964 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. 
The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/3d9d6605-e5ff-4bef-b8c7-6b9ac81cd2f4 b/docstore/3d9d6605-e5ff-4bef-b8c7-6b9ac81cd2f4 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/3d9d6605-e5ff-4bef-b8c7-6b9ac81cd2f4 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/3db321f1-4eed-459c-b147-9a65f55f24de b/docstore/3db321f1-4eed-459c-b147-9a65f55f24de new file mode 100644 index 0000000000000000000000000000000000000000..3ecdc47bdfdb376a1d8226f76a3e20fc1fff4015 --- /dev/null +++ b/docstore/3db321f1-4eed-459c-b147-9a65f55f24de @@ -0,0 +1 @@ +"createTunedModel" : print ( m . name ) break # create tuning model training_dataset = types . TuningDataset ( examples = [ types . TuningExample ( text_input = f 'input { i } ' , output = f 'output { i } ' , ) for i in range ( 5 ) ], ) tuning_job = client . tunings . tune ( base_model = 'models/gemini-1.5-flash-001-tuning' , training_dataset = training_dataset , config = types . 
CreateTuningJobConfig ( epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , tuned_model_display_name = "test tuned model" ) ) # generate content with the tuned model response = client . models . generate_content ( model = tuning_job . tuned_model . model , contents = '55' , ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/3db50abf-2079-4949-a965-54f0f80c453b b/docstore/3db50abf-2079-4949-a965-54f0f80c453b new file mode 100644 index 0000000000000000000000000000000000000000..40517314fd91c121847408df8a1f7fc600adf0b3 --- /dev/null +++ b/docstore/3db50abf-2079-4949-a965-54f0f80c453b @@ -0,0 +1 @@ +string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property \ No newline at end of file diff --git a/docstore/3dc9eee7-1e37-49a3-8fe2-4cc493663d9f b/docstore/3dc9eee7-1e37-49a3-8fe2-4cc493663d9f new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/3dc9eee7-1e37-49a3-8fe2-4cc493663d9f @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. 
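To illustrate the property-ordering warning above, here is a minimal sketch that passes a dictionary-style schema, including propertyOrdering, through the generation config. It assumes the SDK accepts a raw JSON-schema dictionary here, and the recipe fields are illustrative.

```python
from google import genai

client = genai.Client()

# Assumed example schema: propertyOrdering pins the order in which the
# properties should appear, matching any few-shot examples you provide.
recipe_schema = {
    "type": "object",
    "properties": {
        "recipe_name": {"type": "string"},
        "ingredients": {"type": "array", "items": {"type": "string"}},
        "servings": {"type": "integer"},
    },
    "required": ["recipe_name"],
    "propertyOrdering": ["recipe_name", "ingredients", "servings"],
}

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Give me a simple pancake recipe.",
    config={
        "response_mime_type": "application/json",
        "response_schema": recipe_schema,
    },
)
print(response.text)
```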
Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/3de4d7c1-f06c-463e-ba4c-11fe2efa3f30 b/docstore/3de4d7c1-f06c-463e-ba4c-11fe2efa3f30 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/3de4d7c1-f06c-463e-ba4c-11fe2efa3f30 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . 
For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/3dec2357-ecb6-49b4-bfb3-1655f25d11f2 b/docstore/3dec2357-ecb6-49b4-bfb3-1655f25d11f2 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/3dec2357-ecb6-49b4-bfb3-1655f25d11f2 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/3e1cca8a-3f15-46ec-83f0-021386054b1a b/docstore/3e1cca8a-3f15-46ec-83f0-021386054b1a new file mode 100644 index 0000000000000000000000000000000000000000..6b1a11d386f4b560f93e6fc6fce6c7f46a05bdf0 --- /dev/null +++ b/docstore/3e1cca8a-3f15-46ec-83f0-021386054b1a @@ -0,0 +1 @@ +responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . 
connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Compute the largest prime palindrome under 100000.' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Grounding with Google Search You can enable Grounding with Google Search as part of the session configuration. This increases the Live API's accuracy and prevents hallucinations. See the Grounding tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = \ No newline at end of file diff --git a/docstore/3e2b121d-a9ff-44f7-9bf7-180101007210 b/docstore/3e2b121d-a9ff-44f7-9bf7-180101007210 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/3e2b121d-a9ff-44f7-9bf7-180101007210 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
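The Python example that follows the Search-grounding note above is cut off in this chunk. Here is a minimal sketch of how the google_search tool might be added to a Live API session config, assuming the same asyncio client pattern; the model name and question are illustrative.

```python
import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

# Assumed sketch: enable Grounding with Google Search in the session config.
config = {
    "response_modalities": ["TEXT"],
    "tools": [{"google_search": {}}],
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns={
                "role": "user",
                "parts": [{"text": "When did the last Brazil vs. Argentina match happen?"}],
            },
            turn_complete=True,
        )
        async for response in session.receive():
            if response.text is not None:
                print(response.text, end="")

asyncio.run(main())
```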
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/3e7f1f13-aeaa-437c-926e-1aaa9ab92196 b/docstore/3e7f1f13-aeaa-437c-926e-1aaa9ab92196 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/3e7f1f13-aeaa-437c-926e-1aaa9ab92196 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/3e8123a2-963c-41fe-8405-9ff8ac9365b4 b/docstore/3e8123a2-963c-41fe-8405-9ff8ac9365b4 new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/3e8123a2-963c-41fe-8405-9ff8ac9365b4 @@ -0,0 +1 @@ +operation = await ai . models . 
generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/3e9e176d-0c20-4dff-bd4f-3aff3c93e1cf b/docstore/3e9e176d-0c20-4dff-bd4f-3aff3c93e1cf new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/3e9e176d-0c20-4dff-bd4f-3aff3c93e1cf @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. 
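As a usage illustration for the Veo parameters listed above, here is a minimal Python sketch that passes a prompt, a negative prompt, and an aspect ratio, then polls the long-running operation; the prompt text and output filenames are illustrative assumptions.

```python
import time
from google import genai
from google.genai import types

client = genai.Client()

# Assumed sketch of the parameters described above; prompt and negative
# prompt text are illustrative.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A stylized oak tree with leaves blowing in a strong wind",
    config=types.GenerateVideosConfig(
        negative_prompt="wall, frame",  # describe what to avoid, not "no walls"
        aspect_ratio="16:9",
    ),
)

# Video generation is long-running, so poll the operation until it completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video_{n}.mp4")
```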
Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/3eb193a0-2e9b-41ba-89f2-ee50c040d44f b/docstore/3eb193a0-2e9b-41ba-89f2-ee50c040d44f new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/3eb193a0-2e9b-41ba-89f2-ee50c040d44f @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/3eb9c252-9943-44cd-a67f-4422fda9f7b1 b/docstore/3eb9c252-9943-44cd-a67f-4422fda9f7b1 new file mode 100644 index 0000000000000000000000000000000000000000..ebd105342549a255faf01232b49ba70d20b000ef --- /dev/null +++ b/docstore/3eb9c252-9943-44cd-a67f-4422fda9f7b1 @@ -0,0 +1 @@ +the next tier. Why use the paid tier? 
When you enable billing and use the paid tier, you benefit from higher rate limits , and your prompts and responses aren't used to improve Google products. For more information on data use for paid services, see the terms of service . Cloud Billing The Gemini API uses Cloud Billing for billing services. To use the paid tier, you must set up Cloud Billing on your cloud project. After you've enabled Cloud Billing, you can use Cloud Billing tools to track spending, understand costs, make payments, and access Cloud Billing support. Enable billing You can enable Cloud Billing starting from Google AI Studio: Open Google AI Studio . In the bottom of the left sidebar, select Settings > Plan information . Click Set up Billing for your chosen project to enable Cloud Billing. Monitor usage After you enable Cloud Billing, you can monitor your usage of the Gemini API in the Google Cloud console . The service name for the API is generativelanguage.googleapis.com , and in the console the Gemini API is also referred to as the Generative Language API . To learn more, see the Google Cloud documentation on monitoring API usage . Frequently asked questions This section provides answers to frequently asked questions. What am I billed for? Gemini API pricing is based on the following: Input token count Output token count Cached token count Cached token storage duration For pricing information, see the pricing page . Where can I view my quota? You can view your quota and system limits in the Google Cloud console . How do I request more quota? To request more quota, follow the instructions at How to request an upgrade . Can I use the Gemini API for free in EEA (including EU), the UK, and CH? Yes, we make the free tier and paid tier available in many regions . If I set up billing with the Gemini API, will I be charged for my Google AI Studio usage? No, Google AI Studio usage remains free of charge regardless of if you set up billing across all supported \ No newline at end of file diff --git a/docstore/3edcb0f1-f4a1-4a6c-bb44-850258719530 b/docstore/3edcb0f1-f4a1-4a6c-bb44-850258719530 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/3edcb0f1-f4a1-4a6c-bb44-850258719530 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/3edf05ea-8575-4690-af4d-fd745844038f b/docstore/3edf05ea-8575-4690-af4d-fd745844038f new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/3edf05ea-8575-4690-af4d-fd745844038f @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
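As a rough sketch of how you might call this embedding model with the google-genai Python SDK used elsewhere in these docs (the embed_content method and the result fields shown here are assumptions to verify against the embeddings guide):
from google import genai

client = genai.Client()

# Request an embedding for an input string (model code as listed below).
result = client.models.embed_content(
    model="models/text-embedding-004",
    contents="What is the meaning of life?",
)

# text-embedding-004 returns a 768-dimensional vector per input.
print(len(result.embeddings[0].values))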
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/3eec5585-0b7a-4433-b0af-3852ca6a223b b/docstore/3eec5585-0b7a-4433-b0af-3852ca6a223b new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/3eec5585-0b7a-4433-b0af-3852ca6a223b @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
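A minimal connection sketch for these native audio models, assuming the Live API pattern shown later in this document (the send_realtime_input audio call, the receive loop, and the audio_bytes placeholder are illustrative assumptions, not the canonical example):
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-2.5-flash-preview-native-audio-dialog"
config = types.LiveConnectConfig(response_modalities=["AUDIO"])

async def main():
    # Open a bidirectional Live API session with the native audio model.
    async with client.aio.live.connect(model=model, config=config) as session:
        # Stream 16-bit, 16 kHz mono PCM audio in (audio_bytes is a placeholder).
        audio_bytes = b""  # replace with real PCM data
        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )
        # Responses arrive as a stream of messages carrying audio data.
        async for message in session.receive():
            if message.data:
                pass  # handle returned audio bytes here

asyncio.run(main())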
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/3eed1883-6839-4632-9a72-efeb2efc3fce b/docstore/3eed1883-6839-4632-9a72-efeb2efc3fce new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/3eed1883-6839-4632-9a72-efeb2efc3fce @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/3ef67dfd-0340-437c-ad3d-5c9a7e8e52a2 b/docstore/3ef67dfd-0340-437c-ad3d-5c9a7e8e52a2 new file mode 100644 index 0000000000000000000000000000000000000000..46dc106c387700742db50f2912cf28b003e737e9 --- /dev/null +++ b/docstore/3ef67dfd-0340-437c-ad3d-5c9a7e8e52a2 @@ -0,0 +1 @@ +ClientConfig { APIKey : " YOUR_API_KEY " , Backend : genai . BackendGeminiAPI , }) if err != nil { log . Fatal ( err ) } result , err := client . Models . 
GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = Client . builder (). apiKey ( " YOUR_API_KEY " ). build (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $ YOUR_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Keep your API key secure Treat your Gemini API key like a password. If compromised, others can use your project's quota, incur charges (if billing is enabled), and access your private data, such as files. Critical security rules Never commit API keys to source control. Do not check your API key into version control systems like Git. Never expose API keys on the client-side. Do not use your API key directly in web or mobile apps in production. Keys in client-side code (including our JavaScript/TypeScript libraries and REST calls) can be extracted. Best practices Use server-side calls with API keys The most secure way to use your API key is to call the Gemini API from a server-side application where the key can be kept confidential. Use ephemeral tokens for client-side access (Live API only): For direct client-side access to the Live API, you can use ephemeral tokens. They come with lower security risks and can be \ No newline at end of file diff --git a/docstore/3f361755-1c00-497b-95db-6ae53f282f7e b/docstore/3f361755-1c00-497b-95db-6ae53f282f7e new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/3f361755-1c00-497b-95db-6ae53f282f7e @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. 
Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/3f70bc11-21c7-4f5e-a4bb-bc84a45b773a b/docstore/3f70bc11-21c7-4f5e-a4bb-bc84a45b773a new file mode 100644 index 0000000000000000000000000000000000000000..8a34a1fe66a041005f53a5e081e09b0fa5f13242 --- /dev/null +++ b/docstore/3f70bc11-21c7-4f5e-a4bb-bc84a45b773a @@ -0,0 +1 @@ +Grounding with Google Search | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Grounding with Google Search Grounding with Google Search connects the Gemini model to real-time web content and works with all available languages . This allows Gemini to provide more accurate answers and cite verifiable sources beyond its knowledge cutoff. Grounding helps you build applications that can: Increase factual accuracy: Reduce model hallucinations by basing responses on real-world information. Access real-time information: Answer questions about recent events and topics. Provide citations: Build user trust by showing the sources for the model's claims. Python from google import genai from google.genai import types # Configure the client client = genai . Client () # Define the grounding tool grounding_tool = types . Tool ( google_search = types . GoogleSearch () ) # Configure generation settings config = types . GenerateContentConfig ( tools = [ grounding_tool ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Who won the euro 2024?" , config = config , ) # Print the grounded response print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Define the grounding tool const groundingTool = { googleSearch : {}, }; // Configure generation settings const config = { tools : [ groundingTool ], }; // Make the request const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Who won the euro 2024?" , config , }); // Print the grounded response console . log ( response . \ No newline at end of file diff --git a/docstore/3f795a51-274f-43c1-a59f-9e63b176af2d b/docstore/3f795a51-274f-43c1-a59f-9e63b176af2d new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/3f795a51-274f-43c1-a59f-9e63b176af2d @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . 
NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/3face7cf-4ace-4c11-89fa-37ee24cd172f b/docstore/3face7cf-4ace-4c11-89fa-37ee24cd172f new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/3face7cf-4ace-4c11-89fa-37ee24cd172f @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
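For comparison with the JavaScript and Go snippets earlier in this document, a Python sketch of the same Veo flow might look like the following (parameter names are inferred from those snippets; treat it as illustrative rather than canonical):
import time
from google import genai
from google.genai import types

client = genai.Client()

# Kick off a long-running Veo generation job.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Panning wide shot of a calico kitten sleeping in the sunshine",
    config=types.GenerateVideosConfig(aspect_ratio="16:9", number_of_videos=2),
)

# Video generation is asynchronous, so poll the operation until it finishes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Download and save each generated video.
for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video_{n}.mp4")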
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/3fade572-fcfe-4af4-b22f-e83be5863d7a b/docstore/3fade572-fcfe-4af4-b22f-e83be5863d7a new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/3fade572-fcfe-4af4-b22f-e83be5863d7a @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/3fce31a8-3b96-448b-b07a-7749e390ee28 b/docstore/3fce31a8-3b96-448b-b07a-7749e390ee28 new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/3fce31a8-3b96-448b-b07a-7749e390ee28 @@ -0,0 +1 @@ +voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . 
FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/3fe3ce53-f19c-46bd-a28a-7e0566782e40 b/docstore/3fe3ce53-f19c-46bd-a28a-7e0566782e40 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/3fe3ce53-f19c-46bd-a28a-7e0566782e40 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/400de73d-abc3-46f3-9c61-db4b1ad5ffa9 b/docstore/400de73d-abc3-46f3-9c61-db4b1ad5ffa9 new file mode 100644 index 0000000000000000000000000000000000000000..b362fdd58ed7301c466f0b3a048e65a061fc1b90 --- /dev/null +++ b/docstore/400de73d-abc3-46f3-9c61-db4b1ad5ffa9 @@ -0,0 +1 @@ +"messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. 
Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { \ No newline at end of file diff --git a/docstore/401f4790-6990-4ed7-b15b-351bdbfd0bac b/docstore/401f4790-6990-4ed7-b15b-351bdbfd0bac new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/401f4790-6990-4ed7-b15b-351bdbfd0bac @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
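2.5 Flash is a thinking model, and its thinking budget can also be set through the native Python SDK. A minimal sketch follows (the ThinkingConfig field names are an assumption to verify against the Thinking guide referenced later in this document):
from google import genai
from google.genai import types

client = genai.Client()

# 2.5 Flash thinks by default; a budget caps how many tokens it may spend thinking.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)
print(response.text)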
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/40236d6b-0cc1-427f-88f9-ca09f4d79da1 b/docstore/40236d6b-0cc1-427f-88f9-ca09f4d79da1 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/40236d6b-0cc1-427f-88f9-ca09f4d79da1 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/40579d36-003e-41df-99e8-c094fa7d2a87 b/docstore/40579d36-003e-41df-99e8-c094fa7d2a87 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/40579d36-003e-41df-99e8-c094fa7d2a87 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. 
In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/405ba821-f2da-4a5f-b400-ef3eb5b1dd15 b/docstore/405ba821-f2da-4a5f-b400-ef3eb5b1dd15 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/405ba821-f2da-4a5f-b400-ef3eb5b1dd15 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits.
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/4077e78a-dfef-43a6-8392-527dfb169226 b/docstore/4077e78a-dfef-43a6-8392-527dfb169226 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4077e78a-dfef-43a6-8392-527dfb169226 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/408c05c7-659e-4ce6-9014-beb5e672d110 b/docstore/408c05c7-659e-4ce6-9014-beb5e672d110 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/408c05c7-659e-4ce6-9014-beb5e672d110 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/4099563f-7392-444b-be92-62ffd608c0ce b/docstore/4099563f-7392-444b-be92-62ffd608c0ce new file mode 100644 index 0000000000000000000000000000000000000000..08922eb1e5da83e7a67a2a4aeaf4437890d1333a --- /dev/null +++ b/docstore/4099563f-7392-444b-be92-62ffd608c0ce @@ -0,0 +1 @@ +trademark of Oracle and/or its affiliates. Last updated 2025-05-31 UTC. \ No newline at end of file diff --git a/docstore/40a44b74-d644-48d6-b3cf-053c48f87d5d b/docstore/40a44b74-d644-48d6-b3cf-053c48f87d5d new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/40a44b74-d644-48d6-b3cf-053c48f87d5d @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/40a6a34a-ff6c-41b3-b495-267d2ad4532f b/docstore/40a6a34a-ff6c-41b3-b495-267d2ad4532f new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/40a6a34a-ff6c-41b3-b495-267d2ad4532f @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a \ No newline at end of file diff --git a/docstore/40afb3fd-65c2-4757-b6d4-df0f19e14d1f b/docstore/40afb3fd-65c2-4757-b6d4-df0f19e14d1f new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/40afb3fd-65c2-4757-b6d4-df0f19e14d1f @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/40b1df01-c213-4517-bf73-30bad9af419f b/docstore/40b1df01-c213-4517-bf73-30bad9af419f new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/40b1df01-c213-4517-bf73-30bad9af419f @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. Avoid exceeding three phrases for cleaner compositions. Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. 
For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/40b48d98-f18a-48de-be23-312fe92f51e0 b/docstore/40b48d98-f18a-48de-be23-312fe92f51e0 new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/40b48d98-f18a-48de-be23-312fe92f51e0 @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries. Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/40b6d488-65d0-4170-aa49-0bddff8b36d8 b/docstore/40b6d488-65d0-4170-aa49-0bddff8b36d8 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/40b6d488-65d0-4170-aa49-0bddff8b36d8 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/40cc3cf4-9939-4a2a-8835-7100a07a9bcf b/docstore/40cc3cf4-9939-4a2a-8835-7100a07a9bcf new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/40cc3cf4-9939-4a2a-8835-7100a07a9bcf @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . 
AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/40cd5f26-4b91-4a3d-8037-2bba3795f4a8 b/docstore/40cd5f26-4b91-4a3d-8037-2bba3795f4a8 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/40cd5f26-4b91-4a3d-8037-2bba3795f4a8 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
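Editor's note: the affective-dialog and proactive-audio snippets above show only the LiveConnectConfig; the session usage itself is elided ("callbacks: ..."). Below is a hedged Python sketch of how such a config might be used with the Live API. The connect/receive pattern, the model name, and the prompt are assumptions based on the surrounding pages, not the page's own continuation.

```python
# Hedged sketch: using the affective-dialog / proactive-audio config with the Live API.
import asyncio

from google import genai
from google.genai import types

# v1alpha is required for affective dialog and proactive audio, per the text above.
client = genai.Client(http_options={"api_version": "v1alpha"})

config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    enable_affective_dialog=True,
    proactivity={"proactive_audio": True},
)

async def main():
    # Model name is an assumption; these features need a native audio output model.
    async with client.aio.live.connect(
        model="gemini-2.5-flash-preview-native-audio-dialog", config=config
    ) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello there")])
        )
        async for message in session.receive():
            if message.data is not None:
                pass  # handle streamed audio bytes here

asyncio.run(main())
```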
getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/40f51cc5-656d-4f2f-b300-0bd91b5c2f33 b/docstore/40f51cc5-656d-4f2f-b300-0bd91b5c2f33 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/40f51cc5-656d-4f2f-b300-0bd91b5c2f33 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/40ff8d74-bcfe-4fd6-b684-cbacbf3821d3 b/docstore/40ff8d74-bcfe-4fd6-b684-cbacbf3821d3 new file mode 100644 index 0000000000000000000000000000000000000000..9a406d6d652b4766d7f38fc77bc77aa7dbb4036f --- /dev/null +++ b/docstore/40ff8d74-bcfe-4fd6-b684-cbacbf3821d3 @@ -0,0 +1 @@ +Context caching | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Context caching Python JavaScript Go REST In a typical AI workflow, you might pass the same input tokens over and over to a model. The Gemini API offers two different caching mechanisms: Implicit caching (automatically enabled on Gemini 2.5 models, no cost saving guarantee) Explicit caching (can be manually enabled on most models, cost saving guarantee) Explicit caching is useful in cases where you want to guarantee cost savings, but with some added developer work. Implicit caching Implicit caching is enabled by default for all Gemini 2.5 models. We automatically pass on cost savings if your request hits caches. There is nothing you need to do in order to enable this. It is effective as of May 8th, 2025. The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. To increase the chance of an implicit cache hit: Try putting large and common contents at the beginning of your prompt Try to send requests with similar prefix in a short amount of time You can see the number of tokens which were cache hits in the response object's usage_metadata field. Explicit caching Using the Gemini API explicit caching feature, you can pass some content to the model once, cache the input tokens, and then refer to the cached tokens for subsequent requests. At certain volumes, using cached tokens is lower cost than passing in the same corpus of tokens repeatedly. When you cache a set of tokens, you can choose how long you want the cache to exist before the tokens are automatically deleted. 
This caching duration is called the time to live (TTL). If not set, \ No newline at end of file diff --git a/docstore/41285443-d711-4487-9a78-a0b1dcb2e4ec b/docstore/41285443-d711-4487-9a78-a0b1dcb2e4ec new file mode 100644 index 0000000000000000000000000000000000000000..771c2c741948f29f5c3605e7090d7f1d54bfcf1f --- /dev/null +++ b/docstore/41285443-d711-4487-9a78-a0b1dcb2e4ec @@ -0,0 +1 @@ +For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion" , "String" , "Woodwind" , "Brass" , or " "Keyboard" ". You could create an enum to help with this task. In the following example, you pass an enum as the responseSchema , constraining the model to choose the most appropriate option. Python from google import genai import enum class Instrument ( enum . Enum ): PERCUSSION = "Percussion" STRING = "String" WOODWIND = "Woodwind" BRASS = "Brass" KEYBOARD = "Keyboard" client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : Instrument , }, ) print ( response . text ) # Woodwind JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "What type of instrument is an oboe?" , config : { responseMimeType : "text/x.enum" , responseSchema : { type : Type . STRING , enum : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, }); console . log ( response . text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "What type of instrument is an oboe?" } ] }], "generationConfig": { "responseMimeType": "text/x.enum", "responseSchema": { "type": "STRING", "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"] } } }' The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ( Schema ). There are two other ways to specify an enumeration. You can use a Literal : ``` Python Literal [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ] \ No newline at end of file diff --git a/docstore/41349a26-b433-4505-a734-9203ce5c419c b/docstore/41349a26-b433-4505-a734-9203ce5c419c new file mode 100644 index 0000000000000000000000000000000000000000..cd1a6469d32d4344455628e24b4f24d47cbf3ee6 --- /dev/null +++ b/docstore/41349a26-b433-4505-a734-9203ce5c419c @@ -0,0 +1 @@ +new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "List a few popular cookie recipes, and include the amounts of ingredients." , config : { responseMimeType : "application/json" , responseSchema : { type : Type . ARRAY , items : { type : Type . OBJECT , properties : { recipeName : { type : Type . STRING , }, ingredients : { type : Type . ARRAY , items : { type : Type . STRING , }, }, }, propertyOrdering : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . 
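Editor's note: the context-caching page above describes the explicit-caching flow (cache input tokens once, set a TTL, reuse them) in prose only. The following is a hedged Python sketch of that flow; the file name, model, system instruction, and TTL value are illustrative.

```python
# Hedged sketch of the explicit-caching flow described above (google-genai SDK assumed).
from google import genai
from google.genai import types

client = genai.Client()

# Upload a large document once; file name and model are illustrative.
document = client.files.upload(file="a11.txt")

cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        contents=[document],
        system_instruction="You are an expert at analyzing transcripts.",
        ttl="3600s",  # the time to live; a default applies if omitted
    ),
)

# Subsequent requests reference the cached tokens instead of resending them.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Find a lighthearted moment from this transcript",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)
```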
GenerateContentConfig { ResponseMIMEType : "application/json" , ResponseSchema : & genai . Schema { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeObject , Properties : map [ string ] * genai . Schema { "recipeName" : { Type : genai . TypeString }, "ingredients" : { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeString }, }, }, PropertyOrdering : [] string { "recipeName" , "ingredients" }, }, }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "List a few popular cookie recipes, and include the amounts of ingredients." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "List a few popular cookie recipes, and include the amounts of ingredients." } ] }], "generationConfig": { "responseMimeType": "application/json", "responseSchema": { "type": "ARRAY", "items": { "type": "OBJECT", "properties": { "recipeName": { \ No newline at end of file diff --git a/docstore/413cc5a5-50c5-4ee0-8b6d-63678559901a b/docstore/413cc5a5-50c5-4ee0-8b6d-63678559901a new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/413cc5a5-50c5-4ee0-8b6d-63678559901a @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
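Editor's note: the recipe-schema example above appears in JavaScript, Go, and REST but the Python variant is truncated out of these hunks. Below is a hedged Python counterpart using Pydantic models, which the Python SDK accepts as a response schema; the class and field names are illustrative rather than the page's own.

```python
# Hedged Python counterpart to the JavaScript/Go/REST structured-output examples above.
from pydantic import BaseModel

from google import genai

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)
print(response.text)            # raw JSON text
recipes: list[Recipe] = response.parsed  # parsed into the Pydantic models
```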
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/417676f1-650f-4be9-b205-0f087c2324e8 b/docstore/417676f1-650f-4be9-b205-0f087c2324e8 new file mode 100644 index 0000000000000000000000000000000000000000..6a4622a97ffb0b0371d502ca7df8c32e8b1dccc9 --- /dev/null +++ b/docstore/417676f1-650f-4be9-b205-0f087c2324e8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-1.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/417b8129-5a43-4a7b-ba19-3806c192f20d b/docstore/417b8129-5a43-4a7b-ba19-3806c192f20d new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/417b8129-5a43-4a7b-ba19-3806c192f20d @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/4184a8b5-922e-44a0-b381-9e835b5cdfdd b/docstore/4184a8b5-922e-44a0-b381-9e835b5cdfdd new file mode 100644 index 0000000000000000000000000000000000000000..fde5008e10da059aa2ac847e9fab5e369116574b --- /dev/null +++ b/docstore/4184a8b5-922e-44a0-b381-9e835b5cdfdd @@ -0,0 +1 @@ +You can set fields as required to force the model to provide a value. 
If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on. A complex schema can result in an InvalidArgument: 400 error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors. If you get this error with a valid schema, make one or more of the following changes to resolve the error: Shorten property names or enum names. Flatten nested arrays. Reduce the number of properties with constraints, such as numbers with minimum and maximum limits. Reduce the number of properties with complex constraints, such as properties with complex formats like date-time . Reduce the number of optional properties. Reduce the number of valid values for enums. If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. What's next Now that you've learned how to generate structured output, you might want to try using Gemini API tools: Function calling Code execution Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/41a0d5c3-2347-4af8-9f3b-4172e28a3e03 b/docstore/41a0d5c3-2347-4af8-9f3b-4172e28a3e03 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/41a0d5c3-2347-4af8-9f3b-4172e28a3e03 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/41a5750c-1f1d-49b4-b8e8-e2ffab67057e b/docstore/41a5750c-1f1d-49b4-b8e8-e2ffab67057e new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/41a5750c-1f1d-49b4-b8e8-e2ffab67057e @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . 
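Editor's note: the two-image comparison above is shown in Go and REST. The following is a hedged Python sketch of the same pattern (one image via the Files API, one inline); the file paths are placeholders.

```python
# Hedged Python sketch mirroring the Go/REST two-image comparison flow above.
from google import genai
from google.genai import types

client = genai.Client()

# First image: uploaded through the Files API.
uploaded = client.files.upload(file="path/to/image1.jpg")

# Second image: passed inline as raw bytes.
with open("path/to/image2.png", "rb") as f:
    img2_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "What is different between these two images?",
        uploaded,
        types.Part.from_bytes(data=img2_bytes, mime_type="image/png"),
    ],
)
print(response.text)
```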
NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/41ae905d-7087-42ee-87f4-c785561c068f b/docstore/41ae905d-7087-42ee-87f4-c785561c068f new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/41ae905d-7087-42ee-87f4-c785561c068f @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
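Editor's note: the page above stops at "Configure the function calling mode to allow using all of the specified tools." Below is a hedged Python sketch of one way to wire the three declarations into a request and force tool use; the generate_content variant and the prompt are assumptions, not the page's own continuation (which is elided).

```python
# Hedged sketch: passing the declarations above as tools and forcing a function call.
# Assumes the power_disco_ball / start_music / dim_lights dicts defined earlier.
from google import genai
from google.genai import types

client = genai.Client()

house_tools = types.Tool(
    function_declarations=[power_disco_ball, start_music, dim_lights]
)

config = types.GenerateContentConfig(
    tools=[house_tools],
    tool_config=types.ToolConfig(
        # "ANY" forces the model to respond with a call to one of the declared functions.
        function_calling_config=types.FunctionCallingConfig(mode="ANY")
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn this place into a party!",
    config=config,
)

for part in response.candidates[0].content.parts:
    if part.function_call:
        print(part.function_call.name, dict(part.function_call.args))
```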
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/41e3d518-7688-4049-a60c-8ad54d5031d8 b/docstore/41e3d518-7688-4049-a60c-8ad54d5031d8 new file mode 100644 index 0000000000000000000000000000000000000000..7413252ebdd25d38cf2330833a4ae08ecf178fd2 --- /dev/null +++ b/docstore/41e3d518-7688-4049-a60c-8ad54d5031d8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies#main-content Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/41e8ce81-7e52-494c-bb35-d5186b918584 b/docstore/41e8ce81-7e52-494c-bb35-d5186b918584 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/41e8ce81-7e52-494c-bb35-d5186b918584 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/41ee67ea-fdcf-4327-b7cc-c68bd8bb96a9 b/docstore/41ee67ea-fdcf-4327-b7cc-c68bd8bb96a9 new file mode 100644 index 0000000000000000000000000000000000000000..142d44299675e72008ee076f6f6fed64a9d19949 --- /dev/null +++ b/docstore/41ee67ea-fdcf-4327-b7cc-c68bd8bb96a9 @@ -0,0 +1 @@ +charge $0.50 (text) Output price Free of charge $10.00 (audio) Used to improve our products Yes No Gemini 2.5 Pro Preview TTS Try it in Google AI Studio Our 2.5 Pro text-to-speech audio model optimized for powerful, low-latency speech generation for more natural outputs and easier to steer prompts. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $1.00 (text) Output price Not available $20.00 (audio) Used to improve our products Yes No Gemini 2.0 Flash Try it in Google AI Studio Our most balanced multimodal model with great performance across all tasks, with a 1 million token context window, and built for the era of Agents. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.10 (text / image / video) $0.70 (audio) Output price Free of charge $0.40 Context caching price Free of charge $0.025 / 1,000,000 tokens (text/image/video) $0.175 / 1,000,000 tokens (audio) Context caching (storage) Free of charge, up to 1,000,000 tokens of storage per hour $1.00 / 1,000,000 tokens per hour Image generation pricing Free of charge $0.039 per image* Tuning price Not available Not available Grounding with Google Search Free of charge, up to 500 RPD 1,500 RPD (free), then $35 / 1,000 requests Live API Free of charge Input: $0.35 (text), $2.10 (audio / image [video]) Output: $1.50 (text), $8.50 (audio) Used to improve our products Yes No [*] Image output is priced at $30 per 1,000,000 tokens. Output images up to 1024x1024px consume 1290 tokens and are equivalent to $0.039 per image. Gemini 2.0 Flash-Lite Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075 Output price Free of charge $0.30 Context caching price Not available Not available Context caching (storage) Not available Not available Tuning price Not available Not available Grounding \ No newline at end of file diff --git a/docstore/41efedb7-f192-4d61-94cf-917e94f79b86 b/docstore/41efedb7-f192-4d61-94cf-917e94f79b86 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/41efedb7-f192-4d61-94cf-917e94f79b86 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . 
close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/422710cb-00f8-4447-a966-b362d2c2dce0 b/docstore/422710cb-00f8-4447-a966-b362d2c2dce0 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/422710cb-00f8-4447-a966-b362d2c2dce0 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
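Editor's note: the best-practices list above (clear descriptions, descriptive names, strong typing) is prose only. The following illustrative declaration, in the same dict format used earlier on these pages, shows those practices applied; the function and its parameters are invented for illustration.

```python
# Hedged illustration of the best practices listed above: a descriptive name,
# clear descriptions, and strongly typed parameters (integer, enum).
set_thermostat = {
    "name": "set_thermostat_temperature",  # descriptive, no spaces/periods/dashes
    "description": "Sets the target temperature of the home thermostat.",
    "parameters": {
        "type": "object",
        "properties": {
            "temperature_celsius": {
                "type": "integer",
                "description": "Target temperature in degrees Celsius, e.g. 21.",
            },
            "mode": {
                "type": "string",
                "enum": ["heat", "cool", "auto"],
                "description": "Operating mode to use while reaching the target.",
            },
        },
        "required": ["temperature_celsius", "mode"],
    },
}
```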
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/42274b8e-cb62-4ca8-893c-90ed4c1f5481 b/docstore/42274b8e-cb62-4ca8-893c-90ed4c1f5481 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/42274b8e-cb62-4ca8-893c-90ed4c1f5481 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 
1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/422fb9f1-1b2f-4638-900d-7c7c643a5a1e b/docstore/422fb9f1-1b2f-4638-900d-7c7c643a5a1e new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/422fb9f1-1b2f-4638-900d-7c7c643a5a1e @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' 
}, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/42307dfc-bbb4-4329-8533-a1d237abba5c b/docstore/42307dfc-bbb4-4329-8533-a1d237abba5c new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/42307dfc-bbb4-4329-8533-a1d237abba5c @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/42316ae7-9f65-49e9-9354-e3dc9f5184e0 b/docstore/42316ae7-9f65-49e9-9354-e3dc9f5184e0 new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/42316ae7-9f65-49e9-9354-e3dc9f5184e0 @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
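Editor's note: the chat-history example above appears in JavaScript, Go, REST, and Apps Script; a hedged Python counterpart using the SDK's chat helper is sketched below, with the same seeded history.

```python
# Hedged Python counterpart to the multi-language chat-history example above.
from google import genai
from google.genai import types

client = genai.Client()

chat = client.chats.create(
    model="gemini-2.5-flash",
    history=[
        types.Content(
            role="user",
            parts=[types.Part(text="Hi, nice to meet you! I have 2 dogs in my house.")],
        ),
        types.Content(
            role="model",
            parts=[types.Part(text="Great to meet you. What would you like to know?")],
        ),
    ],
)

response = chat.send_message("How many paws are in my house?")
print(response.text)
```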
curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/423f93b4-9912-4356-bf2d-9a3e961da7eb b/docstore/423f93b4-9912-4356-bf2d-9a3e961da7eb new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/423f93b4-9912-4356-bf2d-9a3e961da7eb @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/4246553d-a40e-42a7-abea-bf5f7c949197 b/docstore/4246553d-a40e-42a7-abea-bf5f7c949197 new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/4246553d-a40e-42a7-abea-bf5f7c949197 @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/42619ad4-8ceb-425e-a657-e5838a55ca07 b/docstore/42619ad4-8ceb-425e-a657-e5838a55ca07 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/42619ad4-8ceb-425e-a657-e5838a55ca07 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
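Editor's note: the photorealistic prompting keywords above (lens type, focal length, film treatments) are shown only as prompts and rendered results. A hedged Python sketch of generating one of those portraits with the Imagen model named in the text follows; the output filenames are illustrative.

```python
# Hedged sketch: generating one of the photorealistic portraits described above
# with imagen-3.0-generate-002 (google-genai SDK assumed).
from io import BytesIO

from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()

result = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A woman, 35mm portrait, film noir",
    config=types.GenerateImagesConfig(number_of_images=1),
)

for n, generated in enumerate(result.generated_images):
    Image.open(BytesIO(generated.image.image_bytes)).save(f"portrait_{n}.png")
```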
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/427a9863-469c-4b3f-8183-ca12585c59cf b/docstore/427a9863-469c-4b3f-8183-ca12585c59cf new file mode 100644 index 0000000000000000000000000000000000000000..4635be3508ca0afda1003e879ec15abc83b5d7a7 --- /dev/null +++ b/docstore/427a9863-469c-4b3f-8183-ca12585c59cf @@ -0,0 +1 @@ +NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), // Specify role } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST PDF_PATH = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" DISPLAY_NAME = "A17_FlightPlan" PROMPT = "Summarize this document" # Download the PDF from the provided URL wget -O " ${ DISPLAY_NAME } .pdf" " ${ PDF_PATH } " MIME_TYPE = $( file -b --mime-type " ${ DISPLAY_NAME } .pdf" ) NUM_BYTES = $( wc -c < " ${ DISPLAY_NAME } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ DISPLAY_NAME } .pdf" 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo "file_uri: ${ file_uri } " # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "' $PROMPT '"}, \ No newline at end of file diff --git a/docstore/428fea2c-28ce-4941-8b4d-ccbc98bb230d b/docstore/428fea2c-28ce-4941-8b4d-ccbc98bb230d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/428fea2c-28ce-4941-8b4d-ccbc98bb230d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/42bf8c19-8c9c-48c7-a65e-840be84415cf b/docstore/42bf8c19-8c9c-48c7-a65e-840be84415cf new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/42bf8c19-8c9c-48c7-a65e-840be84415cf @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/42e945f3-54e7-431e-bb10-6a39e4ca91ac b/docstore/42e945f3-54e7-431e-bb10-6a39e4ca91ac new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/42e945f3-54e7-431e-bb10-6a39e4ca91ac @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . 
fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/42f82f29-224c-4566-8665-94fac3316a33 b/docstore/42f82f29-224c-4566-8665-94fac3316a33 new file mode 100644 index 0000000000000000000000000000000000000000..1928fbda4690570381db2fc0734d5c40f27390c8 --- /dev/null +++ b/docstore/42f82f29-224c-4566-8665-94fac3316a33 @@ -0,0 +1 @@ +Part { InlineData : & genai . Blob { MIMEType : "audio/mp3" , Data : audioBytes , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } A few things to keep in mind about inline audio data: The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the total request size exceed 20 MB, then use the Files API to upload an audio file for use in the request. If you're using an audio sample multiple times, it's more efficient to upload an audio file . Get a transcript To get a transcript of audio data, just ask for it in the prompt: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) prompt = 'Generate a transcript of the speech.' response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ prompt , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Generate a transcript of the speech." , ]), }); console . log ( "result.text=" , result . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Generate a transcript of the speech." 
), \ No newline at end of file diff --git a/docstore/42ff940b-ade6-4fdd-9d91-e26d668775bc b/docstore/42ff940b-ade6-4fdd-9d91-e26d668775bc new file mode 100644 index 0000000000000000000000000000000000000000..fcd49fd9d1e1bfa6316de012e7df2b50b5ffda8c --- /dev/null +++ b/docstore/42ff940b-ade6-4fdd-9d91-e26d668775bc @@ -0,0 +1 @@ +from_cached_content ( cached_content = apollo_cache ) response = apollo_model . generate_content ( "Find a lighthearted moment from this transcript" ) JavaScript import { GoogleAICacheManager , GoogleAIFileManager } from "@google/generative-ai/server" ; import { GoogleGenerativeAI } from "@google/generative-ai" ; const cacheManager = new GoogleAICacheManager ( "GOOGLE_API_KEY" ); const fileManager = new GoogleAIFileManager ( "GOOGLE_API_KEY" ); const uploadResult = await fileManager . uploadFile ( "path/to/a11.txt" , { mimeType : "text/plain" , }); const cacheResult = await cacheManager . create ({ model : "models/gemini-1.5-flash" , contents : [ { role : "user" , parts : [ { fileData : { fileUri : uploadResult . file . uri , mimeType : uploadResult . file . mimeType , }, }, ], }, ], }); console . log ( cacheResult ); const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModelFromCachedContent ( cacheResult ); const result = await model . generateContent ( "Please summarize this transcript." , ); console . log ( result . response . text ()); After Python import requests import pathlib from google import genai from google.genai import types client = genai . Client () # Check which models support caching. for m in client . models . list (): for action in m . supported_actions : if action == "createCachedContent" : print ( m . name ) break # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = client . files . upload ( file = 'a11.txt' ) # Create cache model = 'gemini-1.5-flash-001' apollo_cache = client . caches . create ( model = model , config = { 'contents' : [ document ], 'system_instruction' : 'You are an expert at analyzing transcripts.' , }, ) # Generate response response = client . models . generate_content ( model = model , contents = 'Find a lighthearted moment from this \ No newline at end of file diff --git a/docstore/43068fdf-4e2d-498a-b925-706bcf4461a5 b/docstore/43068fdf-4e2d-498a-b925-706bcf4461a5 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/43068fdf-4e2d-498a-b925-706bcf4461a5 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] 
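The "After" caching example earlier in this chunk is cut off by the chunk boundary just as it issues the generation request. With the google-genai SDK, that call is usually finished by passing the cache's name through GenerateContentConfig; the following is a hedged sketch of that last step, reusing the model and apollo_cache names from the example.

from google.genai import types

response = client.models.generate_content(
    model=model,  # same model string used to create the cache
    contents='Find a lighthearted moment from this transcript',
    config=types.GenerateContentConfig(
        cached_content=apollo_cache.name,  # reference the cache created above
    ),
)
print(response.text)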
This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/431ad892-4740-4ecf-b3d9-1879d6bba3f8 b/docstore/431ad892-4740-4ecf-b3d9-1879d6bba3f8 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/431ad892-4740-4ecf-b3d9-1879d6bba3f8 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. 
Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/431b3ff1-8783-4734-bcf0-441982516c65 b/docstore/431b3ff1-8783-4734-bcf0-441982516c65 new file mode 100644 index 0000000000000000000000000000000000000000..eb8ec6e27c19d3df51e6200107f594f1d9dfe6a8 --- /dev/null +++ b/docstore/431b3ff1-8783-4734-bcf0-441982516c65 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/downloads#main-content Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/431bb6c8-ec7d-4d6d-97ae-75e97da9de84 b/docstore/431bb6c8-ec7d-4d6d-97ae-75e97da9de84 new file mode 100644 index 0000000000000000000000000000000000000000..cd1a6469d32d4344455628e24b4f24d47cbf3ee6 --- /dev/null +++ b/docstore/431bb6c8-ec7d-4d6d-97ae-75e97da9de84 @@ -0,0 +1 @@ +new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "List a few popular cookie recipes, and include the amounts of ingredients." , config : { responseMimeType : "application/json" , responseSchema : { type : Type . ARRAY , items : { type : Type . OBJECT , properties : { recipeName : { type : Type . STRING , }, ingredients : { type : Type . ARRAY , items : { type : Type . STRING , }, }, }, propertyOrdering : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { ResponseMIMEType : "application/json" , ResponseSchema : & genai . Schema { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeObject , Properties : map [ string ] * genai . Schema { "recipeName" : { Type : genai . TypeString }, "ingredients" : { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeString }, }, }, PropertyOrdering : [] string { "recipeName" , "ingredients" }, }, }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "List a few popular cookie recipes, and include the amounts of ingredients." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "List a few popular cookie recipes, and include the amounts of ingredients." } ] }], "generationConfig": { "responseMimeType": "application/json", "responseSchema": { "type": "ARRAY", "items": { "type": "OBJECT", "properties": { "recipeName": { \ No newline at end of file diff --git a/docstore/43266e5e-5111-480e-9d31-8583965c40f6 b/docstore/43266e5e-5111-480e-9d31-8583965c40f6 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/43266e5e-5111-480e-9d31-8583965c40f6 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
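The structured-output examples earlier in this chunk show the JavaScript, Go, and REST forms of the cookie-recipe request, but not the Python form. Here is a hedged Python sketch of the equivalent call using the Pydantic-based response_schema shorthand supported by the google-genai SDK; the class and field names (Recipe, recipe_name) are illustrative.

from pydantic import BaseModel
from google import genai

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],  # the SDK converts this to a JSON schema
    },
)
print(response.text)  # JSON string matching the schema
# response.parsed holds the same data as instantiated Recipe objects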
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/433754ba-9905-4dd9-97f1-a46c0ffe15e3 b/docstore/433754ba-9905-4dd9-97f1-a46c0ffe15e3 new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/433754ba-9905-4dd9-97f1-a46c0ffe15e3 @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
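Since the migration guidance above recommends the new google-genai SDK over the legacy google-generativeai package, a minimal before/after comparison may help orient Python users. This is a sketch, assuming an API key is set in the environment; the legacy form is shown only as comments for contrast.

# Legacy google-generativeai (all support ends end of September 2025):
#   import google.generativeai as genai_legacy
#   genai_legacy.configure(api_key="...")
#   print(genai_legacy.GenerativeModel("gemini-1.5-flash").generate_content("Hello").text)

# Recommended google-genai SDK:
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment
print(client.models.generate_content(model="gemini-2.5-flash", contents="Hello").text)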
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/4341ef31-9a99-44cd-a347-034cb10ad1bb b/docstore/4341ef31-9a99-44cd-a347-034cb10ad1bb new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/4341ef31-9a99-44cd-a347-034cb10ad1bb @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/4346c5a4-8391-4a6d-b7be-ce49dda99738 b/docstore/4346c5a4-8391-4a6d-b7be-ce49dda99738 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/4346c5a4-8391-4a6d-b7be-ce49dda99738 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. 
Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/435a00f4-3faf-4554-9a48-121f222bb054 b/docstore/435a00f4-3faf-4554-9a48-121f222bb054 new file mode 100644 index 0000000000000000000000000000000000000000..7ea9f8bcfd3306547cf34f86a441b3041d14b40c --- /dev/null +++ b/docstore/435a00f4-3faf-4554-9a48-121f222bb054 @@ -0,0 +1 @@ +Safety guidance | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Safety guidance Generative artificial intelligence models are powerful tools, but they are not without their limitations. Their versatility and applicability can sometimes lead to unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing, and rigorous manual evaluation are essential to limit the risk of harm from such outputs. The models provided by the Gemini API can be used for a wide variety of generative AI and natural language processing (NLP) applications. Use of these functions is only available through the Gemini API or the Google AI Studio web app. Your use of Gemini API is also subject to the Generative AI Prohibited Use Policy and the Gemini API terms of service . Part of what makes large language models (LLMs) so useful is that they're creative tools that can address many different language tasks. 
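The Python snippet above is cut off while building GenerateContentConfig for the light-control tool. A hedged sketch of how that "call the model with function declarations" step is typically completed with the google-genai SDK follows; set_light_values_declaration is the declaration defined earlier, and the prompt text is illustrative.

from google import genai
from google.genai import types

client = genai.Client()
tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn the lights down to a romantic level",
    config=config,
)
# If the model decides a tool is needed, the part contains a function_call suggestion.
print(response.candidates[0].content.parts[0].function_call)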
Unfortunately, this also means that large language models can generate output that you don't expect, including text that's offensive, insensitive, or factually incorrect. What's more, the incredible versatility of these models is also what makes it difficult to predict exactly what kinds of undesirable output they might produce. While the Gemini API has been designed with Google's AI principles in mind, the onus is on developers to apply these models responsibly. To aid developers in creating safe, responsible applications, the Gemini API has some built-in content filtering as well as adjustable safety settings across 4 dimensions of harm. Refer to the safety settings guide to learn more. This document is meant to introduce you \ No newline at end of file diff --git a/docstore/435bed5a-4a45-4385-b417-d9d0a8a84c86 b/docstore/435bed5a-4a45-4385-b417-d9d0a8a84c86 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/435bed5a-4a45-4385-b417-d9d0a8a84c86 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. 
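For the voice selection mentioned at the end of the Live API passage above (Puck, Charon, Kore, and so on), the voice and language are set in the configuration passed when the session is opened. The following Python sketch shows one plausible shape of that configuration with the google-genai SDK; the model name, voice, and prompt are examples, not recommendations.

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        language_code="en-US",
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
        ),
    ),
)

async def main():
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello"}]}, turn_complete=True
        )
        async for message in session.receive():
            if message.data:
                print(f"received {len(message.data)} bytes of audio")

asyncio.run(main())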
Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/435ce418-1a23-4b43-b495-a020f176fae7 b/docstore/435ce418-1a23-4b43-b495-a020f176fae7 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/435ce418-1a23-4b43-b495-a020f176fae7 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/4366704a-3d9c-4209-a246-3627d59b212f b/docstore/4366704a-3d9c-4209-a246-3627d59b212f new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/4366704a-3d9c-4209-a246-3627d59b212f @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. 
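The Imagen 3 entry above lists the model code imagen-3.0-generate-002. A short, hedged Python sketch of calling it through the google-genai SDK follows; the prompt and output filename are purely illustrative.

from io import BytesIO
from PIL import Image
from google import genai
from google.genai import types

client = genai.Client()
result = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A calico kitten sleeping in the sunshine",
    config=types.GenerateImagesConfig(number_of_images=1),
)
# Each generated image is returned as raw bytes.
image = Image.open(BytesIO(result.generated_images[0].image.image_bytes))
image.save("kitten.png")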
Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/438ccd31-4b6b-4ecf-bd69-b5f2fd1a2f2d b/docstore/438ccd31-4b6b-4ecf-bd69-b5f2fd1a2f2d new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/438ccd31-4b6b-4ecf-bd69-b5f2fd1a2f2d @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/4392293a-10ce-4180-b34d-f19b0eb77c4a b/docstore/4392293a-10ce-4180-b34d-f19b0eb77c4a new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/4392293a-10ce-4180-b34d-f19b0eb77c4a @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/43a7d822-dc67-4b19-bab2-5bc1301e1f59 b/docstore/43a7d822-dc67-4b19-bab2-5bc1301e1f59 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/43a7d822-dc67-4b19-bab2-5bc1301e1f59 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. 
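The embedding model details above (text-embedding-004, 768-dimensional output) translate to a single call in the google-genai SDK. A minimal sketch, with the input text illustrative:

from google import genai

client = genai.Client()
result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)
vector = result.embeddings[0].values
print(len(vector))  # 768 dimensions for text-embedding-004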
You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/43ad1473-5564-41eb-824a-5f343c3b3636 b/docstore/43ad1473-5564-41eb-824a-5f343c3b3636 new file mode 100644 index 0000000000000000000000000000000000000000..0f1e2cf35fbba506f3a6545a63a1d18c57ccb9a5 --- /dev/null +++ b/docstore/43ad1473-5564-41eb-824a-5f343c3b3636 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/43b3022c-950a-4db0-ab02-b8017a2dd96d b/docstore/43b3022c-950a-4db0-ab02-b8017a2dd96d new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/43b3022c-950a-4db0-ab02-b8017a2dd96d @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . 
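Since the capabilities passage above notes that Gemini 2.0 and later models are further trained for object detection, a hedged sketch of the usual prompting pattern may help; asking for normalized 0-1000 box coordinates is the convention used in the detection cookbook, and the image path and labels here are illustrative.

from google import genai
from google.genai import types

client = genai.Client()
with open("path/to/image.png", "rb") as f:  # hypothetical local image
    image_bytes = f.read()

prompt = (
    "Detect the prominent items in the image. "
    "Return a JSON list where each entry has a 'label' and a 'box_2d' "
    "given as [ymin, xmin, ymax, xmax] normalized to 0-1000."
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[types.Part.from_bytes(data=image_bytes, mime_type="image/png"), prompt],
)
print(response.text)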
Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/43ccbfcf-fd66-4825-9435-c1974d484a66 b/docstore/43ccbfcf-fd66-4825-9435-c1974d484a66 new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/43ccbfcf-fd66-4825-9435-c1974d484a66 @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/4404f4b0-7ee9-4474-a6f9-e1db3a531532 b/docstore/4404f4b0-7ee9-4474-a6f9-e1db3a531532 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4404f4b0-7ee9-4474-a6f9-e1db3a531532 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
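Given the per-tile token accounting described above, it is often easier to let the API count tokens for a mixed image-plus-text request than to estimate tiles by hand. A minimal sketch using count_tokens, with the file path illustrative:

from google import genai
from google.genai import types

client = genai.Client()
with open("path/to/small-sample.jpg", "rb") as f:
    image_bytes = f.read()

usage = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Caption this image.",
    ],
)
print(usage.total_tokens)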
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/4406c5a5-d6af-46f1-90d2-5058b2a00d39 b/docstore/4406c5a5-d6af-46f1-90d2-5058b2a00d39 new file mode 100644 index 0000000000000000000000000000000000000000..a3fd9d3225fb67d0660508c87d747294298e3c33 --- /dev/null +++ b/docstore/4406c5a5-d6af-46f1-90d2-5058b2a00d39 @@ -0,0 +1 @@ +{ "uri" : "https://vertexaisearch.cloud.google.com....." , "title" : "uefa.com" }} ], "groundingSupports" : [ { "segment" : { "startIndex" : 0 , "endIndex" : 85 , "text" : "Spain won Euro 2024, defeatin..." }, "groundingChunkIndices" : [ 0 ] }, { "segment" : { "startIndex" : 86 , "endIndex" : 210 , "text" : "This victory marks Spain's..." }, "groundingChunkIndices" : [ 0 , 1 ] } ] } } ] } The Gemini API returns the following information with the groundingMetadata : webSearchQueries : Array of the search queries used. This is useful for debugging and understanding the model's reasoning process. searchEntryPoint : Contains the HTML and CSS to render the required Search Suggestions. Full usage requirements are detailed in the Terms of Service . groundingChunks : Array of objects containing the web sources ( uri and title ). groundingSupports : Array of chunks to connect model response text to the sources in groundingChunks . Each chunk links a text segment (defined by startIndex and endIndex ) to one or more groundingChunkIndices . This is the key to building inline citations. Grounding with Google Search can also be used in combination with the URL context tool to ground responses in both public web data and the specific URLs you provide. Attributing Sources with inline Citations The API returns structured citation data, giving you complete control over how you display sources in your user interface. You can use the groundingSupports and groundingChunks fields to link the model's statements directly to their sources. 
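The groundingMetadata fields described above are returned only when the Google Search tool is enabled on the request. A hedged Python sketch of enabling it and reading back the search queries and sources (the question is illustrative):

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Who won Euro 2024?",
    config=types.GenerateContentConfig(
        tools=[types.Tool(google_search=types.GoogleSearch())],
    ),
)
print(response.text)
metadata = response.candidates[0].grounding_metadata
print(metadata.web_search_queries)     # queries the model issued
print(len(metadata.grounding_chunks))  # web sources backing the answer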
Here is a common pattern for processing the metadata to create a response with inline, clickable citations. Python def add_citations ( response ): text = response . text supports = response . candidates [ 0 ] . grounding_metadata . grounding_supports chunks = response . candidates [ 0 ] . grounding_metadata . grounding_chunks # Sort supports by end_index in descending order to avoid shifting issues when inserting. sorted_supports = sorted ( supports , key \ No newline at end of file diff --git a/docstore/4409ed22-8ae3-4ceb-9c2e-bd12feb9580c b/docstore/4409ed22-8ae3-4ceb-9c2e-bd12feb9580c new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/4409ed22-8ae3-4ceb-9c2e-bd12feb9580c @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . \ No newline at end of file diff --git a/docstore/441d113b-5f27-47ab-bab8-ca9e77c848bb b/docstore/441d113b-5f27-47ab-bab8-ca9e77c848bb new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/441d113b-5f27-47ab-bab8-ca9e77c848bb @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
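The add_citations helper above is truncated by the chunk boundary right after its sorting step. Based on the groundingSupports and groundingChunks fields described earlier, the rest of that common pattern typically looks like the following sketch; the Markdown-style link format is a presentation choice, not part of the API.

def add_citations(response):
    text = response.text
    supports = response.candidates[0].grounding_metadata.grounding_supports
    chunks = response.candidates[0].grounding_metadata.grounding_chunks
    # Sort supports by end_index descending so earlier insertions don't shift later offsets.
    sorted_supports = sorted(supports, key=lambda s: s.segment.end_index, reverse=True)
    for support in sorted_supports:
        end_index = support.segment.end_index
        if support.grounding_chunk_indices:
            links = [
                f"[{i + 1}]({chunks[i].web.uri})"
                for i in support.grounding_chunk_indices
                if chunks[i].web and chunks[i].web.uri
            ]
            text = text[:end_index] + ", ".join(links) + text[end_index:]
    return text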
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/44253a7d-bca2-4993-a2c7-61fd5e12c891 b/docstore/44253a7d-bca2-4993-a2c7-61fd5e12c891 new file mode 100644 index 0000000000000000000000000000000000000000..6ed0306a7036cb374be3f59365e1052324311f95 --- /dev/null +++ b/docstore/44253a7d-bca2-4993-a2c7-61fd5e12c891 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation#main-content Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4440ce61-fc9b-45ec-a025-4e3d81d8c8de b/docstore/4440ce61-fc9b-45ec-a025-4e3d81d8c8de new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/4440ce61-fc9b-45ec-a025-4e3d81d8c8de @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/444fb2cb-68bb-4eae-82f3-5b8fa995e526 b/docstore/444fb2cb-68bb-4eae-82f3-5b8fa995e526 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/444fb2cb-68bb-4eae-82f3-5b8fa995e526 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/445598c6-a00f-4deb-9846-521b451eff1b b/docstore/445598c6-a00f-4deb-9846-521b451eff1b new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/445598c6-a00f-4deb-9846-521b451eff1b @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/44599d99-1911-4bbe-9b08-80e2d8d994c3 b/docstore/44599d99-1911-4bbe-9b08-80e2d8d994c3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/44599d99-1911-4bbe-9b08-80e2d8d994c3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/4466aec5-d268-4a64-bc15-2b7cb90e5f9d b/docstore/4466aec5-d268-4a64-bc15-2b7cb90e5f9d new file mode 100644 index 0000000000000000000000000000000000000000..c839e4b299fa83f191461c51a3897f429d1b3fab --- /dev/null +++ b/docstore/4466aec5-d268-4a64-bc15-2b7cb90e5f9d @@ -0,0 +1 @@ +Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . 
OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the \ No newline at end of file diff --git a/docstore/44755693-fb55-4634-af03-f7406149eebe b/docstore/44755693-fb55-4634-af03-f7406149eebe new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/44755693-fb55-4634-af03-f7406149eebe @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? 
Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/44782ddb-bf39-4e5c-84fb-f5c7408c51e6 b/docstore/44782ddb-bf39-4e5c-84fb-f5c7408c51e6 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/44782ddb-bf39-4e5c-84fb-f5c7408c51e6 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/4479112b-bc11-4430-8be5-10e386176644 b/docstore/4479112b-bc11-4430-8be5-10e386176644 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/4479112b-bc11-4430-8be5-10e386176644 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. 
To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/447ca5fc-bd42-402f-9717-ff781872892f b/docstore/447ca5fc-bd42-402f-9717-ff781872892f new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/447ca5fc-bd42-402f-9717-ff781872892f @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . 
\ No newline at end of file diff --git a/docstore/44872b63-3808-45e4-ae91-d20470c2f476 b/docstore/44872b63-3808-45e4-ae91-d20470c2f476 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/44872b63-3808-45e4-ae91-d20470c2f476 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/4487c846-703c-4d76-9ae9-a8a22035ec6a b/docstore/4487c846-703c-4d76-9ae9-a8a22035ec6a new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/4487c846-703c-4d76-9ae9-a8a22035ec6a @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . 
name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/4487faf2-223d-4d4b-a101-8c4a9711cede b/docstore/4487faf2-223d-4d4b-a101-8c4a9711cede new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4487faf2-223d-4d4b-a101-8c4a9711cede @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/4492657e-f60b-4ce2-b4f8-0bcd43099546 b/docstore/4492657e-f60b-4ce2-b4f8-0bcd43099546 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/4492657e-f60b-4ce2-b4f8-0bcd43099546 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/44cc79c4-67f7-4228-8f86-086c5bd271aa b/docstore/44cc79c4-67f7-4228-8f86-086c5bd271aa new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/44cc79c4-67f7-4228-8f86-086c5bd271aa @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/44ceffe6-ae44-484c-82ea-fb27e6007141 b/docstore/44ceffe6-ae44-484c-82ea-fb27e6007141 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/44ceffe6-ae44-484c-82ea-fb27e6007141 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/44d53912-ca83-4b53-aa06-52a57ac8eff9 b/docstore/44d53912-ca83-4b53-aa06-52a57ac8eff9 new file mode 100644 index 0000000000000000000000000000000000000000..5f25eb2a53a9afab2cc27675039b1ff3f0e2b594 --- /dev/null +++ b/docstore/44d53912-ca83-4b53-aa06-52a57ac8eff9 @@ -0,0 +1 @@ +suitable for production use. Review ephemeral tokens guide for more information. Consider adding restrictions to your key: You can limit a key's permissions by adding API key restrictions . This minimizes the potential damage if the key is ever leaked. For some general best practices, you can also review this support article . 
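To make the key-handling advice above concrete, here is a small sketch, assuming the google-genai Python SDK, that keeps the API key out of source code by reading it from the environment; the automatic GEMINI_API_KEY pickup matches the other snippets in this corpus, but verify it for your SDK version.
Python
import os
from google import genai

# Explicitly source the key from the environment rather than a string literal.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
# Equivalent shorthand: genai.Client() reads GEMINI_API_KEY from the environment itself.

response = client.models.generate_content(model="gemini-2.5-flash", contents="ping")
print(response.text)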
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/44edbda7-f869-4c76-a991-c85adf56dfb0 b/docstore/44edbda7-f869-4c76-a991-c85adf56dfb0 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/44edbda7-f869-4c76-a991-c85adf56dfb0 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/44fb0da2-0117-4b61-8913-84bf42a97c78 b/docstore/44fb0da2-0117-4b61-8913-84bf42a97c78 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/44fb0da2-0117-4b61-8913-84bf42a97c78 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/45081c25-b9dd-48d9-b244-5a00c852a3ba b/docstore/45081c25-b9dd-48d9-b244-5a00c852a3ba new file mode 100644 index 0000000000000000000000000000000000000000..9e3ef9ab4fdb39828d1be4d7038f9dd43cffdc73 --- /dev/null +++ b/docstore/45081c25-b9dd-48d9-b244-5a00c852a3ba @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/openai#thinking Title: OpenAI compatibility | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/452b958a-7668-4b40-8652-694e9bf45c22 b/docstore/452b958a-7668-4b40-8652-694e9bf45c22 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/452b958a-7668-4b40-8652-694e9bf45c22 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 
2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/4532b4dc-f41c-4680-bd64-0f3b79c4ff09 b/docstore/4532b4dc-f41c-4680-bd64-0f3b79c4ff09 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/4532b4dc-f41c-4680-bd64-0f3b79c4ff09 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/4551213c-9ee6-43ac-b90c-afbaa615fdea b/docstore/4551213c-9ee6-43ac-b90c-afbaa615fdea new file mode 100644 index 0000000000000000000000000000000000000000..002d74f0081d5a1754ecf09d829f2e05938acd13 --- /dev/null +++ b/docstore/4551213c-9ee6-43ac-b90c-afbaa615fdea @@ -0,0 +1 @@ +turn_on_the_lights , turn_off_the_lights ] } ] const config = { responseModalities : [ Modality . TEXT ], tools : tools } // ... remaining model call What's next Check out more examples of using tools with the Live API in the Tool use cookbook . Get the full story on features and configurations from the Live API Capabilities guide . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/45579448-57cf-4dca-8a62-1daa17368bf5 b/docstore/45579448-57cf-4dca-8a62-1daa17368bf5 new file mode 100644 index 0000000000000000000000000000000000000000..42fa1181f59c2770e01e6d989f77a4fc2017b457 --- /dev/null +++ b/docstore/45579448-57cf-4dca-8a62-1daa17368bf5 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/code-execution Title: Code execution | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/45756d2b-1eda-465e-a772-d4eb79b5195f b/docstore/45756d2b-1eda-465e-a772-d4eb79b5195f new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/45756d2b-1eda-465e-a772-d4eb79b5195f @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. 
google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/458ed89d-7bc2-443f-ab38-1b64f844c88b b/docstore/458ed89d-7bc2-443f-ab38-1b64f844c88b new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/458ed89d-7bc2-443f-ab38-1b64f844c88b @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/45928ff7-329e-467e-aa9a-72307810bc0c b/docstore/45928ff7-329e-467e-aa9a-72307810bc0c new file mode 100644 index 0000000000000000000000000000000000000000..48ce7760ed3b3e078bbb96293e0e67132c5a10c7 --- /dev/null +++ b/docstore/45928ff7-329e-467e-aa9a-72307810bc0c @@ -0,0 +1 @@ +Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). 
"Jupiter's Great Red Spot Shrinks and Grows" is in the \ No newline at end of file diff --git a/docstore/45b6524b-d745-478f-b4c1-df3e8035c95d b/docstore/45b6524b-d745-478f-b4c1-df3e8035c95d new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/45b6524b-d745-478f-b4c1-df3e8035c95d @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/45ba16fc-65a9-470a-a011-7b922022f8f6 b/docstore/45ba16fc-65a9-470a-a011-7b922022f8f6 new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/45ba16fc-65a9-470a-a011-7b922022f8f6 @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . 
from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/45ee377f-a9ae-4bf0-8eb9-a9c7919f0ebf b/docstore/45ee377f-a9ae-4bf0-8eb9-a9c7919f0ebf new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/45ee377f-a9ae-4bf0-8eb9-a9c7919f0ebf @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/45f1abd1-5833-40f1-8d41-b7bad2d04136 b/docstore/45f1abd1-5833-40f1-8d41-b7bad2d04136 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/45f1abd1-5833-40f1-8d41-b7bad2d04136 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
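Before the model details that follow, here is a brief, hedged sketch of calling an Imagen model through the google-genai Python SDK. It reuses the Imagen 3 model code and an example prompt from this section; the generate_images method and config fields are assumptions to confirm against the image generation guide.
Python
from google import genai
from google.genai import types

client = genai.Client()

# Sketch: request a single image from the Imagen 3 model id listed in this section.
response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="4k HDR beautiful photo of a corn stalk taken by a professional photographer",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Each generated image exposes raw bytes that can be written to disk.
with open("corn_stalk.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)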
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/4603fb65-ce25-4195-9fae-6736d9157636 b/docstore/4603fb65-ce25-4195-9fae-6736d9157636 new file mode 100644 index 0000000000000000000000000000000000000000..1426f6277d87da029e324e49b5a4fcb88dde544c --- /dev/null +++ b/docstore/4603fb65-ce25-4195-9fae-6736d9157636 @@ -0,0 +1 @@ +live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . 
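As a compact, hedged sketch tying together the two server messages above, the loop below watches for both go_away and generation_complete on the same session; the field names mirror the Python snippets in this corpus and should be confirmed against the Live API guide.
Python
import asyncio
from google import genai

client = genai.Client()

async def main():
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview",
        config={"response_modalities": ["TEXT"]},
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello"}]},
            turn_complete=True,
        )
        async for response in session.receive():
            # GoAway: the connection will soon be terminated.
            if response.go_away is not None:
                print("Time left before disconnect:", response.go_away.time_left)
            # generationComplete: the model finished generating this response.
            if response.server_content and response.server_content.generation_complete:
                print("Generation complete")
                break

asyncio.run(main())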
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/462782f9-eef1-43da-8ed7-35a4f7846e2a b/docstore/462782f9-eef1-43da-8ed7-35a4f7846e2a new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/462782f9-eef1-43da-8ed7-35a4f7846e2a @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/463620e2-74b7-4556-bcf6-b309c53ba27d b/docstore/463620e2-74b7-4556-bcf6-b309c53ba27d new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/463620e2-74b7-4556-bcf6-b309c53ba27d @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. 
The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a Live API-only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/46392ec6-0a62-4f98-b61c-ee79ed55033b b/docstore/46392ec6-0a62-4f98-b61c-ee79ed55033b new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/46392ec6-0a62-4f98-b61c-ee79ed55033b @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete .
name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Delete the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/46417d25-3095-455f-9a3f-d03601e95819 b/docstore/46417d25-3095-455f-9a3f-d03601e95819 new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/46417d25-3095-455f-9a3f-d03601e95819 @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. This lets you capture more of the scene vertically compared to the 1:1 aspect ratio. Prompt: a woman hiking, close of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. This is a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/465070cd-39b9-44a0-9252-9b04e0288e2c b/docstore/465070cd-39b9-44a0-9252-9b04e0288e2c new file mode 100644 index 0000000000000000000000000000000000000000..4781e96bc89cf0be67f0a65c094deb317c17f5b0 --- /dev/null +++ b/docstore/465070cd-39b9-44a0-9252-9b04e0288e2c @@ -0,0 +1 @@ +result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) } Other use cases and platforms Refer to use case specific guides on Gemini Developer API Documentation and Vertex AI documentation for other platforms and use cases. Migration considerations When you migrate: You'll need to use Google Cloud service accounts to authenticate. See the Vertex AI documentation for more information.
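For the client-side part of that authentication change, a minimal sketch with the google-genai SDK; the project ID and region are placeholder values, and Vertex AI credentials are assumed to come from Application Default Credentials rather than an API key:

from google import genai

# Gemini Developer API: authenticates with an API key (for example from GEMINI_API_KEY).
dev_client = genai.Client()

# Vertex AI: authenticates with Google Cloud credentials instead of an API key.
# "my-project" and "us-central1" are placeholder values.
vertex_client = genai.Client(
    vertexai=True,
    project="my-project",
    location="us-central1",
)

response = vertex_client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Tell me about New York?",
)
print(response.text)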
You can use your existing Google Cloud project (the same one you used to generate your API key) or you can create a new Google Cloud project . Supported regions may differ between the Gemini Developer API and the Vertex AI Gemini API. See the list of supported regions for generative AI on Google Cloud . Any models you created in Google AI Studio need to be retrained in Vertex AI. If you no longer need to use your Gemini API key for the Gemini Developer API, then follow security best practices and delete it. To delete an API key: Open the Google Cloud API Credentials page. Find the API key you want to delete and click the Actions icon. Select Delete API key . In the Delete credential modal, select Delete . Deleting an API key takes a few minutes to propagate. After propagation completes, any traffic using the deleted API key is rejected. Important: If you have deleted a key that is still used in production and need to recover it, see gcloud beta services api-keys undelete . Next steps See the Generative AI on Vertex AI overview to learn more about generative AI solutions on Vertex AI. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-26 UTC. \ No newline at end of file diff --git a/docstore/465cdd56-e02b-4253-9c3e-b3e58c422172 b/docstore/465cdd56-e02b-4253-9c3e-b3e58c422172 new file mode 100644 index 0000000000000000000000000000000000000000..d8826dd5dee907947e5c1cc9cf1f235de6140a7f --- /dev/null +++ b/docstore/465cdd56-e02b-4253-9c3e-b3e58c422172 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4671ccc8-6ed3-4116-9f48-cdbd761be8b3 b/docstore/4671ccc8-6ed3-4116-9f48-cdbd761be8b3 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/4671ccc8-6ed3-4116-9f48-cdbd761be8b3 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/467ef475-4a98-4537-89b3-9d01a93ffdee b/docstore/467ef475-4a98-4537-89b3-9d01a93ffdee new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/467ef475-4a98-4537-89b3-9d01a93ffdee @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. 
You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/468c6e22-431d-4016-b82b-a09ad66b14b4 b/docstore/468c6e22-431d-4016-b82b-a09ad66b14b4 new file mode 100644 index 0000000000000000000000000000000000000000..5f8a5e922d24af531eff4f89e4f99a5736b0820b --- /dev/null +++ b/docstore/468c6e22-431d-4016-b82b-a09ad66b14b4 @@ -0,0 +1 @@ +(`totalTokenCount`). console . log ( generateResult . response . usageMetadata ); // candidatesTokenCount and totalTokenCount depend on response, may vary // { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 } After Python from google import genai client = genai . Client () response = client . models . count_tokens ( model = 'gemini-2.0-flash' , contents = 'The quick brown fox jumps over the lazy dog.' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const prompt = "The quick brown fox jumps over the lazy dog." ; const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( countTokensResponse . totalTokens ); const generateResponse = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( generateResponse . usageMetadata ); Generate images Generate images: Before Python #pip install https://github.com/google-gemini/generative-ai-python@imagen import google.generativeai as genai imagen = genai . ImageGenerationModel ( "imagen-3.0-generate-001" ) gen_images = imagen . 
generate_images ( prompt = "Robot holding a red skateboard" , number_of_images = 1 , safety_filter_level = "block_low_and_above" , person_generation = "allow_adult" , aspect_ratio = "3:4" , ) After Python from google import genai client = genai . Client () gen_images = client . models . generate_images ( model = 'imagen-3.0-generate-001' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 1 , safety_filter_level = "BLOCK_LOW_AND_ABOVE" , person_generation = "ALLOW_ADULT" , aspect_ratio = "3:4" , ) ) for n , image in enumerate ( gen_images . generated_images ): pathlib . Path ( f ' { n } .png' ) . write_bytes ( image . image . image_bytes ) Embed content Generate content embeddings. Before Python import google.generativeai as genai response \ No newline at end of file diff --git a/docstore/4697c88c-52cf-46d1-9727-e8b74075de4e b/docstore/4697c88c-52cf-46d1-9727-e8b74075de4e new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4697c88c-52cf-46d1-9727-e8b74075de4e @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/469c4236-c2ad-46c6-893a-ce889b16cf63 b/docstore/469c4236-c2ad-46c6-893a-ce889b16cf63 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/469c4236-c2ad-46c6-893a-ce889b16cf63 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. 
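For half-cascade Live API models, the language can be requested explicitly with one of the BCP-47 codes listed in the table below. This is a minimal sketch, assuming the SDK's LiveConnectConfig and SpeechConfig shapes and a placeholder model name; as the note below explains, native audio models choose the language automatically and ignore this setting:

from google import genai
from google.genai import types

client = genai.Client()

# Sketch (assumed config shape): ask a half-cascade Live API model to respond in German.
config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(language_code="de-DE"),  # a BCP-47 code from the table below
)

async def main():
    # Placeholder model name; see the Models page for current Live API models.
    async with client.aio.live.connect(model="gemini-live-2.5-flash-preview", config=config) as session:
        ...  # send turns and read responses as shown in the Live API examples elsewhere in this document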
Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/46b34906-7353-43e2-8cff-da4dc4f16c3a b/docstore/46b34906-7353-43e2-8cff-da4dc4f16c3a new file mode 100644 index 0000000000000000000000000000000000000000..d67ba52d95f30a05ea3b9bffc809074e673ec56e --- /dev/null +++ b/docstore/46b34906-7353-43e2-8cff-da4dc4f16c3a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-settings#main-content Title: Safety settings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/46c5ea21-63c2-4e61-9f45-0475d6e11224 b/docstore/46c5ea21-63c2-4e61-9f45-0475d6e11224 new file mode 100644 index 0000000000000000000000000000000000000000..a3fd9d3225fb67d0660508c87d747294298e3c33 --- /dev/null +++ b/docstore/46c5ea21-63c2-4e61-9f45-0475d6e11224 @@ -0,0 +1 @@ +{ "uri" : "https://vertexaisearch.cloud.google.com....." , "title" : "uefa.com" }} ], "groundingSupports" : [ { "segment" : { "startIndex" : 0 , "endIndex" : 85 , "text" : "Spain won Euro 2024, defeatin..." }, "groundingChunkIndices" : [ 0 ] }, { "segment" : { "startIndex" : 86 , "endIndex" : 210 , "text" : "This victory marks Spain's..." }, "groundingChunkIndices" : [ 0 , 1 ] } ] } } ] } The Gemini API returns the following information with the groundingMetadata : webSearchQueries : Array of the search queries used. This is useful for debugging and understanding the model's reasoning process. searchEntryPoint : Contains the HTML and CSS to render the required Search Suggestions. Full usage requirements are detailed in the Terms of Service . groundingChunks : Array of objects containing the web sources ( uri and title ). groundingSupports : Array of chunks to connect model response text to the sources in groundingChunks . 
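The next passage describes how these two arrays combine into inline citations and begins a helper that is cut off mid-definition; a complete, hedged version of that pattern (attribute names follow the grounding metadata fields above, and the [n](uri) numbering scheme is illustrative) might look like:

def add_citations(response):
    """Sketch: insert inline [n](uri) citations into the grounded response text."""
    text = response.text
    metadata = response.candidates[0].grounding_metadata
    supports = metadata.grounding_supports
    chunks = metadata.grounding_chunks

    # Process supports from the end of the text backwards so insertions don't shift earlier offsets.
    sorted_supports = sorted(supports, key=lambda s: s.segment.end_index, reverse=True)

    for support in sorted_supports:
        end_index = support.segment.end_index
        if support.grounding_chunk_indices:
            # Build citation links such as [1](https://...), [2](https://...)
            citation_links = []
            for i in support.grounding_chunk_indices:
                uri = chunks[i].web.uri
                citation_links.append(f"[{i + 1}]({uri})")
            citation_string = ", ".join(citation_links)
            text = text[:end_index] + citation_string + text[end_index:]
    return text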
Each chunk links a text segment (defined by startIndex and endIndex ) to one or more groundingChunkIndices . This is the key to building inline citations. Grounding with Google Search can also be used in combination with the URL context tool to ground responses in both public web data and the specific URLs you provide. Attributing Sources with inline Citations The API returns structured citation data, giving you complete control over how you display sources in your user interface. You can use the groundingSupports and groundingChunks fields to link the model's statements directly to their sources. Here is a common pattern for processing the metadata to create a response with inline, clickable citations. Python def add_citations ( response ): text = response . text supports = response . candidates [ 0 ] . grounding_metadata . grounding_supports chunks = response . candidates [ 0 ] . grounding_metadata . grounding_chunks # Sort supports by end_index in descending order to avoid shifting issues when inserting. sorted_supports = sorted ( supports , key \ No newline at end of file diff --git a/docstore/46cc2111-24eb-42b4-99ef-85382c029e37 b/docstore/46cc2111-24eb-42b4-99ef-85382c029e37 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/46cc2111-24eb-42b4-99ef-85382c029e37 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/46dde462-d3a4-487f-892b-05553609e52c b/docstore/46dde462-d3a4-487f-892b-05553609e52c new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/46dde462-d3a4-487f-892b-05553609e52c @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/46e69560-a16a-4214-9097-94214f528c6a b/docstore/46e69560-a16a-4214-9097-94214f528c6a new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/46e69560-a16a-4214-9097-94214f528c6a @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/46f4809d-68d2-40d8-9571-cffb0239b3a3 b/docstore/46f4809d-68d2-40d8-9571-cffb0239b3a3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/46f4809d-68d2-40d8-9571-cffb0239b3a3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/47124bda-a8df-4c9d-a730-1dd43712461f b/docstore/47124bda-a8df-4c9d-a730-1dd43712461f new file mode 100644 index 0000000000000000000000000000000000000000..b8d7dab8b59ea83c8480687d32380faf07bab32f --- /dev/null +++ b/docstore/47124bda-a8df-4c9d-a730-1dd43712461f @@ -0,0 +1 @@ +google.generativeai as genai # Directly create and use model objects model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( ... ) chat = model . start_chat ( ... 
) JavaScript While GoogleGenerativeAI was a central point for models and chat, other functionalities like file and cache management often required importing and instantiating entirely separate client classes. import { GoogleGenerativeAI } from "@google/generative-ai" ; import { GoogleAIFileManager , GoogleAICacheManager } from "@google/generative-ai/server" ; // For files/caching const genAI = new GoogleGenerativeAI ( "YOUR_API_KEY" ); const fileManager = new GoogleAIFileManager ( "YOUR_API_KEY" ); const cacheManager = new GoogleAICacheManager ( "YOUR_API_KEY" ); // Get a model instance, then call methods on it const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const result = await model . generateContent (...); const chat = model . startChat (...); // Call methods on separate client objects for other services const uploadedFile = await fileManager . uploadFile (...); const cache = await cacheManager . create (...); Go The genai.NewClient function created a client, but generative model operations were typically called on a separate GenerativeModel instance obtained from this client. Other services might have been accessed via distinct packages or patterns. import ( "github.com/google/generative-ai-go/genai" "github.com/google/generative-ai-go/genai/fileman" // For files "google.golang.org/api/option" ) client , err := genai . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) fileClient , err := fileman . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) // Get a model instance, then call methods on it model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ... ) cs := model . StartChat () // Call methods on separate client objects for other services uploadedFile , err := fileClient . \ No newline at end of file diff --git a/docstore/471a70ac-74b0-4919-890d-903d01d1c510 b/docstore/471a70ac-74b0-4919-890d-903d01d1c510 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/471a70ac-74b0-4919-890d-903d01d1c510 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. 
*/ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/47219cc7-db06-4c12-90bb-a4f9f01294f1 b/docstore/47219cc7-db06-4c12-90bb-a4f9f01294f1 new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/47219cc7-db06-4c12-90bb-a4f9f01294f1 @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. 
Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/47302aa2-c258-49e4-a266-968a343301e7 b/docstore/47302aa2-c258-49e4-a266-968a343301e7 new file mode 100644 index 0000000000000000000000000000000000000000..10d595bd2c735f8912abb00e69220b9ae90d3d23 --- /dev/null +++ b/docstore/47302aa2-c258-49e4-a266-968a343301e7 @@ -0,0 +1 @@ +Audio understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Audio understanding Gemini can analyze and understand audio input, enabling use cases like the following: Describe, summarize, or answer questions about audio content. Provide a transcription of the audio. Analyze specific segments of the audio. This guide shows you how to use the Gemini API to generate a text response to audio input. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Input audio You can provide audio data to Gemini in the following ways: Upload an audio file before making a request to generateContent . Pass inline audio data with the request to generateContent . Upload an audio file You can use the Files API to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads an audio file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mp3" }, }); const response = await ai . models . generateContent ({ \ No newline at end of file diff --git a/docstore/475e6f9f-c801-40c0-aa45-b63de813d569 b/docstore/475e6f9f-c801-40c0-aa45-b63de813d569 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/475e6f9f-c801-40c0-aa45-b63de813d569 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/4760f27b-79d2-44db-a132-033d741c5a9e b/docstore/4760f27b-79d2-44db-a132-033d741c5a9e new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4760f27b-79d2-44db-a132-033d741c5a9e @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
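A hedged sketch of a basic request to this image-generation preview model; the response_modalities setting and the part-iteration pattern are assumptions based on the SDK's usual image-output handling, and the prompt and output filename are illustrative:

from google import genai
from google.genai import types

client = genai.Client()

# Sketch: ask the image-generation preview model for text plus an image.
response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Create a picture of a paper airplane flying over a city, and describe it in one sentence.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        # inline_data.data holds the raw image bytes.
        with open("paper_airplane.png", "wb") as f:
            f.write(part.inline_data.data)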
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/476118b0-09dc-4490-9193-79ddc8ad53fc b/docstore/476118b0-09dc-4490-9193-79ddc8ad53fc new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/476118b0-09dc-4490-9193-79ddc8ad53fc @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/47633722-6643-4aa5-a94e-8efc82dd3737 b/docstore/47633722-6643-4aa5-a94e-8efc82dd3737 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/47633722-6643-4aa5-a94e-8efc82dd3737 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . 
Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/476bc365-cfb1-491a-a953-cf1ed1810b96 b/docstore/476bc365-cfb1-491a-a953-cf1ed1810b96 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/476bc365-cfb1-491a-a953-cf1ed1810b96 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 
2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/477b0c4b-1c10-40f1-9d05-ea80ba4dd12d b/docstore/477b0c4b-1c10-40f1-9d05-ea80ba4dd12d new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/477b0c4b-1c10-40f1-9d05-ea80ba4dd12d @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/47832ada-45fc-41f7-9d13-77415bf2ab50 b/docstore/47832ada-45fc-41f7-9d13-77415bf2ab50 new file mode 100644 index 0000000000000000000000000000000000000000..ffa55cd17dc266b0e00c821779e2850dd473d215 --- /dev/null +++ b/docstore/47832ada-45fc-41f7-9d13-77415bf2ab50 @@ -0,0 +1 @@ +"Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. 
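Before the retrieval step below, a small polling loop can wait for the job to reach a terminal state; this sketch reuses client.batches.get and the job-state names shown in this section, and the sleep interval is arbitrary:

import time
from google import genai

client = genai.Client()

job_name = "YOUR_BATCH_JOB_NAME"  # e.g. inline_batch_job.name from the creation step
terminal_states = {"JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}

# Poll until the batch job finishes, one way or another.
batch_job = client.batches.get(name=job_name)
while batch_job.state.name not in terminal_states:
    time.sleep(30)  # arbitrary polling interval
    batch_job = client.batches.get(name=job_name)
    print("Current state:", batch_job.state.name)

print("Final state:", batch_job.state.name)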
Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/478eb903-fa3c-4d4e-9ebd-51b5ebfbfce2 b/docstore/478eb903-fa3c-4d4e-9ebd-51b5ebfbfce2 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/478eb903-fa3c-4d4e-9ebd-51b5ebfbfce2 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
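As a sketch of how that long context is typically used together with the Files API (the file path and prompt are placeholders; the upload pattern matches the Files API examples elsewhere in this document):

from google import genai

client = genai.Client()

# Sketch: upload a large document and let the long-context model reason over it.
big_doc = client.files.upload(file="path/to/large_report.pdf")  # placeholder path

response = client.models.generate_content(
    model="gemini-1.5-pro",
    contents=["Summarize the key findings of this report in five bullet points.", big_doc],
)
print(response.text)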
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/47a336a1-a44e-4daa-bfe5-cbf82f772270 b/docstore/47a336a1-a44e-4daa-bfe5-cbf82f772270 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/47a336a1-a44e-4daa-bfe5-cbf82f772270 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/47c4dc94-e9ba-4558-93ad-565f14ff82ea b/docstore/47c4dc94-e9ba-4558-93ad-565f14ff82ea new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/47c4dc94-e9ba-4558-93ad-565f14ff82ea @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . 
UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/47c5389a-410e-492d-ba0d-9e93d7f0ddfa b/docstore/47c5389a-410e-492d-ba0d-9e93d7f0ddfa new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/47c5389a-410e-492d-ba0d-9e93d7f0ddfa @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 
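The JavaScript, Go, and REST snippets earlier in this section upload an audio file and then reference it in a generateContent request. A rough Python equivalent, assuming the google-genai SDK; the file path and prompt are placeholders.

Python

from google import genai

client = genai.Client()

# Upload the audio file once via the Files API, then reference it in the prompt.
myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[myfile, "Describe this audio clip"],
)
print(response.text)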
2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/47fd5c42-a347-4857-bb71-870d730ec715 b/docstore/47fd5c42-a347-4857-bb71-870d730ec715 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/47fd5c42-a347-4857-bb71-870d730ec715 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/4809f4a9-b382-4c24-a023-fa599751ffd8 b/docstore/4809f4a9-b382-4c24-a023-fa599751ffd8 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/4809f4a9-b382-4c24-a023-fa599751ffd8 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . 
GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/4815322d-5743-4496-b5ce-45a134034669 b/docstore/4815322d-5743-4496-b5ce-45a134034669 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/4815322d-5743-4496-b5ce-45a134034669 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . 
Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/481deb00-e3b2-415f-9bd1-d3c663b7ddcd b/docstore/481deb00-e3b2-415f-9bd1-d3c663b7ddcd new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/481deb00-e3b2-415f-9bd1-d3c663b7ddcd @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/483ba071-4c7e-4541-b35b-d5213e99fe42 b/docstore/483ba071-4c7e-4541-b35b-d5213e99fe42 new file mode 100644 index 0000000000000000000000000000000000000000..1b8db702d1402c9f81138189f71a133485ad1327 --- /dev/null +++ b/docstore/483ba071-4c7e-4541-b35b-d5213e99fe42 @@ -0,0 +1 @@ +URL context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback URL context Experimental: The URL context tool is an experimental feature. Using the URL context tool, you can provide Gemini with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. 
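A minimal sketch of enabling the URL context tool described above, assuming the google-genai Python SDK; the url_context/UrlContext field names are assumptions based on the tool name, and the URL is a placeholder.

Python

from google import genai
from google.genai import types

client = genai.Client()

# Provide a URL in the prompt and enable the tool so the model can fetch its content.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize this document: https://example.com/YOUR_URL",
    config=types.GenerateContentConfig(
        tools=[types.Tool(url_context=types.UrlContext())],
    ),
)
print(response.text)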
This tool is useful for tasks like the following: Extracting key data points or talking points from articles Comparing information across multiple links Synthesizing data from several sources Answering questions based on the content of a specific page or pages Analyzing content for specific purposes (like writing a job description or creating test questions) This guide explains how to use the URL context tool in the Gemini API. Use URL context You can use the URL context tool in two main ways, by itself or in conjunction with Grounding with Google Search . URL Context Only You provide specific URLs that you want the model to analyze directly in your prompt. Example prompts: Summarize this document: YOUR_URLs Extract the key features from the product description on this page: YOUR_URLs Grounding with Google Search + URL Context You can also enable both URL context and Grounding with Google Search together. You can enter a prompt with or without URLs. The model may first search for relevant information and then use the URL context tool to read the content of the search results for a more in-depth understanding. Example prompts: Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute. Recommend 3 books for beginners to read to \ No newline at end of file diff --git a/docstore/484d925a-7c01-4f6a-89ae-e99c346ea63f b/docstore/484d925a-7c01-4f6a-89ae-e99c346ea63f new file mode 100644 index 0000000000000000000000000000000000000000..8711f29609bdb3bc24fbc7c22d5a2b29ce171b6a --- /dev/null +++ b/docstore/484d925a-7c01-4f6a-89ae-e99c346ea63f @@ -0,0 +1 @@ +affiliates. Last updated 2025-02-25 UTC. \ No newline at end of file diff --git a/docstore/4858b0d0-6e61-4f1a-a4d2-3418d295adf3 b/docstore/4858b0d0-6e61-4f1a-a4d2-3418d295adf3 new file mode 100644 index 0000000000000000000000000000000000000000..f80ba77256b88992731ca23f287dfcea3e13a0e2 --- /dev/null +++ b/docstore/4858b0d0-6e61-4f1a-a4d2-3418d295adf3 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/quickstart Title: Gemini API quickstart | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/485ff2dd-5c86-4fcb-9534-00cc285d0137 b/docstore/485ff2dd-5c86-4fcb-9534-00cc285d0137 new file mode 100644 index 0000000000000000000000000000000000000000..cd1a6469d32d4344455628e24b4f24d47cbf3ee6 --- /dev/null +++ b/docstore/485ff2dd-5c86-4fcb-9534-00cc285d0137 @@ -0,0 +1 @@ +new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "List a few popular cookie recipes, and include the amounts of ingredients." , config : { responseMimeType : "application/json" , responseSchema : { type : Type . ARRAY , items : { type : Type . OBJECT , properties : { recipeName : { type : Type . STRING , }, ingredients : { type : Type . ARRAY , items : { type : Type . STRING , }, }, }, propertyOrdering : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { ResponseMIMEType : "application/json" , ResponseSchema : & genai . Schema { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeObject , Properties : map [ string ] * genai . 
Schema { "recipeName" : { Type : genai . TypeString }, "ingredients" : { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeString }, }, }, PropertyOrdering : [] string { "recipeName" , "ingredients" }, }, }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "List a few popular cookie recipes, and include the amounts of ingredients." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "List a few popular cookie recipes, and include the amounts of ingredients." } ] }], "generationConfig": { "responseMimeType": "application/json", "responseSchema": { "type": "ARRAY", "items": { "type": "OBJECT", "properties": { "recipeName": { \ No newline at end of file diff --git a/docstore/4878f8a2-9c5f-48fb-a8f6-ee61a244ff0e b/docstore/4878f8a2-9c5f-48fb-a8f6-ee61a244ff0e new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/4878f8a2-9c5f-48fb-a8f6-ee61a244ff0e @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . 
generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/48a3f3d8-1257-4d62-b92e-e54502b0d8b3 b/docstore/48a3f3d8-1257-4d62-b92e-e54502b0d8b3 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/48a3f3d8-1257-4d62-b92e-e54502b0d8b3 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/48acb590-d3ee-4d4a-bd75-20fcf606320b b/docstore/48acb590-d3ee-4d4a-bd75-20fcf606320b new file mode 100644 index 0000000000000000000000000000000000000000..87bffe59c9f4797f4d0611e74f88436457c136bc --- /dev/null +++ b/docstore/48acb590-d3ee-4d4a-bd75-20fcf606320b @@ -0,0 +1 @@ +unlock new use cases. Some emerging and standard use cases for text based long context include: Summarizing large corpuses of text Previous summarization options with smaller context models would require a sliding window or another technique to keep state of previous sections as new tokens are passed to the model Question and answering Historically this was only possible with RAG given the limited amount of context and models' factual recall being low Agentic workflows Text is the underpinning of how agents keep state of what they have done and what they need to do; not having enough information about the world and the agent's goal is a limitation on the reliability of agents Many-shot in-context learning is one of the most unique capabilities unlocked by long context models. 
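The image-to-video flow started earlier (an Imagen frame, then Veo) finishes with a long-running operation poll. A hedged Python sketch, assuming the google-genai SDK; parameter names mirror the Veo REST example above, and the output filename is a placeholder.

Python

import time
from google import genai
from google.genai import types

client = genai.Client()
prompt = "Panning wide shot of a calico kitten sleeping in the sunshine"

# Generate a starting frame with Imagen, then animate it with Veo.
imagen = client.models.generate_images(model="imagen-3.0-generate-002", prompt=prompt)

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=prompt,
    image=imagen.generated_images[0].image,
    config=types.GenerateVideosConfig(aspect_ratio="16:9", person_generation="dont_allow"),
)

# Video generation is long-running; poll the operation until it completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

video = operation.response.generated_videos[0]
client.files.download(file=video.video)
video.video.save("kitten.mp4")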
Research has shown that taking the common "single shot" or "multi-shot" example paradigm, where the model is presented with one or a few examples of a task, and scaling that up to hundreds, thousands, or even hundreds of thousands of examples, can lead to novel model capabilities. This many-shot approach has also been shown to perform similarly to models which were fine-tuned for a specific task. For use cases where a Gemini model's performance is not yet sufficient for a production rollout, you can try the many-shot approach. As you might explore later in the long context optimization section, context caching makes this type of high input token workload much more economically feasible and even lower latency in some cases. Long form video Video content's utility has long been constrained by the lack of accessibility of the medium itself. It was hard to skim the content, transcripts often failed to capture the nuance of a video, and most tools don't process image, text, and audio together. With Gemini, the long-context text capabilities translate to the ability to reason and answer questions about multimodal inputs with sustained performance. Some emerging and standard use cases for video long \ No newline at end of file diff --git a/docstore/48c86b46-b2ba-4f22-8966-c9647dfb7e4e b/docstore/48c86b46-b2ba-4f22-8966-c9647dfb7e4e new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/48c86b46-b2ba-4f22-8966-c9647dfb7e4e @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/48cba495-f89c-486e-8952-11fefe792b4f b/docstore/48cba495-f89c-486e-8952-11fefe792b4f new file mode 100644 index 0000000000000000000000000000000000000000..48ce7760ed3b3e078bbb96293e0e67132c5a10c7 --- /dev/null +++ b/docstore/48cba495-f89c-486e-8952-11fefe792b4f @@ -0,0 +1 @@ +Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). 
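A short Python sketch of the Files API flow described above for video input, assuming the google-genai SDK; the file path is a placeholder for the downloaded NASA clip.

Python

from google import genai

client = genai.Client()

# Upload once via the Files API (recommended for files over 20 MB or for reuse),
# then reference the uploaded file alongside the text prompt.
myfile = client.files.upload(file="path/to/GreatRedSpot.mp4")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, "Summarize this video and list three notable moments with timestamps."],
)
print(response.text)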
"Jupiter's Great Red Spot Shrinks and Grows" is in the \ No newline at end of file diff --git a/docstore/48d41395-4d7b-4504-8875-cc05624aeb29 b/docstore/48d41395-4d7b-4504-8875-cc05624aeb29 new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/48d41395-4d7b-4504-8875-cc05624aeb29 @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/48ea8047-7121-461e-8419-96f8dd250573 b/docstore/48ea8047-7121-461e-8419-96f8dd250573 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/48ea8047-7121-461e-8419-96f8dd250573 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/48f19775-441b-4963-9124-62d6bbb3b6e7 b/docstore/48f19775-441b-4963-9124-62d6bbb3b6e7 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/48f19775-441b-4963-9124-62d6bbb3b6e7 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/491703e3-35be-4eaa-adbd-9a0d4208c48c b/docstore/491703e3-35be-4eaa-adbd-9a0d4208c48c new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/491703e3-35be-4eaa-adbd-9a0d4208c48c @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. 
Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/491e6428-9bf4-4f12-b11f-dad6a3e11028 b/docstore/491e6428-9bf4-4f12-b11f-dad6a3e11028 new file mode 100644 index 0000000000000000000000000000000000000000..1fd617a587d76016a0c4d5b56098be9076683928 --- /dev/null +++ b/docstore/491e6428-9bf4-4f12-b11f-dad6a3e11028 @@ -0,0 +1 @@ +candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, \ No newline at end of file diff --git a/docstore/494aa275-8364-4e42-9eb7-1e9095883b49 b/docstore/494aa275-8364-4e42-9eb7-1e9095883b49 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/494aa275-8364-4e42-9eb7-1e9095883b49 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. 
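The mask-saving helper above assumes the model has already returned a JSON list of masks. One way to request that output, sketched with the google-genai Python SDK; the exact prompt wording is an assumption, and the image path is a placeholder.

Python

from google import genai
from PIL import Image

client = genai.Client()
im = Image.open("path/to/image.png")

prompt = (
    "Give the segmentation masks for the prominent objects in the image. "
    "Output a JSON list where each entry contains the 2D bounding box in 'box_2d', "
    "the segmentation mask in 'mask', and a descriptive text label in 'label'."
)

# Gemini 2.5 models are trained for enhanced segmentation (see the capabilities notes below).
response = client.models.generate_content(model="gemini-2.5-flash", contents=[im, prompt])
print(response.text)  # JSON to parse and feed into the mask-extraction code above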
An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/4956bef9-d0c5-46a0-8f75-e29ddfafa31e b/docstore/4956bef9-d0c5-46a0-8f75-e29ddfafa31e new file mode 100644 index 0000000000000000000000000000000000000000..91fd6dbcb4d807434a341e23a941ef0850298bc5 --- /dev/null +++ b/docstore/4956bef9-d0c5-46a0-8f75-e29ddfafa31e @@ -0,0 +1 @@ +transcript' , config = types . GenerateContentConfig ( cached_content = apollo_cache . name , ) ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const filePath = path . join ( media , "a11.txt" ); const document = await ai . files . upload ({ file : filePath , config : { mimeType : "text/plain" }, }); console . log ( "Uploaded file name:" , document . name ); const modelName = "gemini-1.5-flash" ; const contents = [ createUserContent ( createPartFromUri ( document . uri , document . mimeType )), ]; const cache = await ai . caches . create ({ model : modelName , config : { contents : contents , systemInstruction : "You are an expert analyzing transcripts." , }, }); console . log ( "Cache created:" , cache ); const response = await ai . models . generateContent ({ model : modelName , contents : "Please summarize this transcript" , config : { cachedContent : cache . name }, }); console . log ( "Response text:" , response . text ); Count tokens Count the number of tokens in a request. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . count_tokens ( 'The quick brown fox jumps over the lazy dog.' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY+); const model = genAI.getGenerativeModel({ model: " gemini - 1.5 - flash ", }); // Count tokens in a prompt without calling text generation. const countResult = await model.countTokens( " The quick brown fox jumps over the lazy dog . ", ); console.log(countResult.totalTokens); // 11 const generateResult = await model.generateContent( " The quick brown fox jumps over the lazy dog . 
" , ); // On the response for `generateContent`, use `usageMetadata` // to get separate input and output token counts // (`promptTokenCount` and `candidatesTokenCount`, respectively), // as well as the combined token count \ No newline at end of file diff --git a/docstore/495935a0-50d0-45df-968a-90dd3533db94 b/docstore/495935a0-50d0-45df-968a-90dd3533db94 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/495935a0-50d0-45df-968a-90dd3533db94 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/495ee4b4-7311-40c3-ab50-6a746fbc0c3f b/docstore/495ee4b4-7311-40c3-ab50-6a746fbc0c3f new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/495ee4b4-7311-40c3-ab50-6a746fbc0c3f @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. 
You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/49708ffd-46ce-44c4-99ef-a58540a35a56 b/docstore/49708ffd-46ce-44c4-99ef-a58540a35a56 new file mode 100644 index 0000000000000000000000000000000000000000..5f25eb2a53a9afab2cc27675039b1ff3f0e2b594 --- /dev/null +++ b/docstore/49708ffd-46ce-44c4-99ef-a58540a35a56 @@ -0,0 +1 @@ +suitable for production use. Review ephemeral tokens guide for more information. Consider adding restrictions to your key: You can limit a key's permissions by adding API key restrictions . This minimizes the potential damage if the key is ever leaked. For some general best practices, you can also review this support article . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/498bfc75-720d-4d1c-8805-0c8aa87aaf11 b/docstore/498bfc75-720d-4d1c-8805-0c8aa87aaf11 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/498bfc75-720d-4d1c-8805-0c8aa87aaf11 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
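Tying back to the thought-signature guidance above: the simplest way to preserve thought context is to append the model's previous content back into the conversation unchanged. A hedged sketch, assuming the google-genai Python SDK; the function declaration is a hypothetical stand-in (signatures are only returned when function declarations are present).

Python

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical declaration so the model can emit thought signatures.
get_time = types.FunctionDeclaration(
    name="get_time",
    description="Returns the current time for a city.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"city": types.Schema(type=types.Type.STRING)},
        required=["city"],
    ),
)
config = types.GenerateContentConfig(tools=[types.Tool(function_declarations=[get_time])])

contents = [types.Content(role="user", parts=[types.Part(text="What time is it in Paris?")])]
response = client.models.generate_content(model="gemini-2.5-flash", contents=contents, config=config)

# Return the model's previous content unchanged (its parts may carry thought signatures),
# then append the function response and the next user turn before calling the model again.
contents.append(response.candidates[0].content)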
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/4991d03e-130b-4192-8cba-97e8f210c68c b/docstore/4991d03e-130b-4192-8cba-97e8f210c68c new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4991d03e-130b-4192-8cba-97e8f210c68c @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/49bc6be0-e552-4a07-85cb-fe7b76570722 b/docstore/49bc6be0-e552-4a07-85cb-fe7b76570722 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/49bc6be0-e552-4a07-85cb-fe7b76570722 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/49c1f32a-0b99-4369-a234-b55dc7e15144 b/docstore/49c1f32a-0b99-4369-a234-b55dc7e15144 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/49c1f32a-0b99-4369-a234-b55dc7e15144 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/49c75593-5129-4ba1-beb2-f2fb7c829897 b/docstore/49c75593-5129-4ba1-beb2-f2fb7c829897 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/49c75593-5129-4ba1-beb2-f2fb7c829897 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
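To put the `automatic_function_calling` disable flag shown above in context, here is a hedged, self-contained sketch: with automatic execution turned off, the model's suggested call comes back as a `function_call` part that your own code handles. The `get_current_temperature` function is a stand-in defined here for illustration only.

```python
from google import genai
from google.genai import types

client = genai.Client()

def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location (mock)."""
    return {"location": location, "temperature_celsius": 18}

# The SDK still derives the declaration from the function, but with disable=True
# it returns the suggested call instead of executing it for you.
config = types.GenerateContentConfig(
    tools=[get_current_temperature],
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in Paris?",
    config=config,
)

part = response.candidates[0].content.parts[0]
if part.function_call:
    print(part.function_call.name, dict(part.function_call.args))
```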
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/49c8c226-74c9-449e-9766-42dc4ce3fd18 b/docstore/49c8c226-74c9-449e-9766-42dc4ce3fd18 new file mode 100644 index 0000000000000000000000000000000000000000..76b3241c42effba70a7ac847bf09514ad7de11cd --- /dev/null +++ b/docstore/49c8c226-74c9-449e-9766-42dc4ce3fd18 @@ -0,0 +1 @@ +not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. 
The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def \ No newline at end of file diff --git a/docstore/49d83d6c-ec3d-43b5-b17a-ce35f94be602 b/docstore/49d83d6c-ec3d-43b5-b17a-ce35f94be602 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/49d83d6c-ec3d-43b5-b17a-ce35f94be602 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/49f8beff-d4de-4220-b081-56a4039b223e b/docstore/49f8beff-d4de-4220-b081-56a4039b223e new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/49f8beff-d4de-4220-b081-56a4039b223e @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. 
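The automatic function calling example above is cut off mid-definition. A hedged reconstruction of the pattern it describes (the function name, docstring, and return value here are illustrative, not taken from the original text) might look like:

```python
from google import genai
from google.genai import types

client = genai.Client()

def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location.

    Args:
        location: A city name, for example "London".
    """
    # Hypothetical stand-in for a real weather lookup.
    return {"location": location, "temperature_celsius": 18}

# Pass the function itself (not a JSON declaration); the SDK builds the
# declaration, executes the suggested call, and sends the result back.
config = types.GenerateContentConfig(tools=[get_current_temperature])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in London right now?",
    config=config,
)
print(response.text)  # the final text already incorporates the function result
```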
Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/49feed03-cbe9-449a-88a9-4b6cea2671be b/docstore/49feed03-cbe9-449a-88a9-4b6cea2671be new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/49feed03-cbe9-449a-88a9-4b6cea2671be @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. 
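As a rough sketch of the server-to-server connection choice described above, the following assumes the async `client.aio.live.connect` entry point of the google-genai Python SDK and one of the native audio model names listed earlier; treat the exact call names as assumptions rather than confirmed API.

```python
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
MODEL = "gemini-2.5-flash-preview-native-audio-dialog"  # native audio model named above

config = types.LiveConnectConfig(response_modalities=["AUDIO"])

async def main():
    # Server-to-server style: this process holds the WebSocket to the Live API.
    async with client.aio.live.connect(model=MODEL, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello there")]),
            turn_complete=True,
        )
        async for message in session.receive():
            if message.data:  # audio bytes (16-bit PCM) as described above
                print(f"received {len(message.data)} bytes of audio")

asyncio.run(main())
```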
It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/4a0e4bea-ec5c-46bf-9d2c-e0e137ad6785 b/docstore/4a0e4bea-ec5c-46bf-9d2c-e0e137ad6785 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/4a0e4bea-ec5c-46bf-9d2c-e0e137ad6785 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/4a22ffd4-6615-4dbc-8ac4-8fa5db732e15 b/docstore/4a22ffd4-6615-4dbc-8ac4-8fa5db732e15 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/4a22ffd4-6615-4dbc-8ac4-8fa5db732e15 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." 
, "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/4a3bc98c-c4fa-4648-881f-334cd9c494a0 b/docstore/4a3bc98c-c4fa-4648-881f-334cd9c494a0 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4a3bc98c-c4fa-4648-881f-334cd9c494a0 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/4a4f62c9-decb-4258-8f21-6c3fc8c36e4b b/docstore/4a4f62c9-decb-4258-8f21-6c3fc8c36e4b new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/4a4f62c9-decb-4258-8f21-6c3fc8c36e4b @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/4a623b6e-52e0-446d-b607-9e05f16cbe48 b/docstore/4a623b6e-52e0-446d-b607-9e05f16cbe48 new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/4a623b6e-52e0-446d-b607-9e05f16cbe48 @@ -0,0 +1 @@ +and their capabilities, visit the Models page. 
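As a hedged illustration of the conversational image generation capability described above: the `response_modalities` setting and `inline_data` handling follow the general patterns on this page, while PIL is assumed here only as a convenient way to save the returned bytes.

```python
from io import BytesIO
from google import genai
from google.genai import types
from PIL import Image  # assumed dependency, used only to save the image

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Create a picture of a lighthouse at sunset",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        Image.open(BytesIO(part.inline_data.data)).save("lighthouse.png")
```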
Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/4a7bce75-c1dc-4219-9212-ccbd4d61bbc0 b/docstore/4a7bce75-c1dc-4219-9212-ccbd4d61bbc0 new file mode 100644 index 0000000000000000000000000000000000000000..44a3473d72d583ff18190b31caba4ebb17636b6e --- /dev/null +++ b/docstore/4a7bce75-c1dc-4219-9212-ccbd4d61bbc0 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#model-versions Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4a9a4af9-2f13-4ed4-b8cd-9763e22edfef b/docstore/4a9a4af9-2f13-4ed4-b8cd-9763e22edfef new file mode 100644 index 0000000000000000000000000000000000000000..c839e4b299fa83f191461c51a3897f429d1b3fab --- /dev/null +++ b/docstore/4a9a4af9-2f13-4ed4-b8cd-9763e22edfef @@ -0,0 +1 @@ +Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . 
push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the \ No newline at end of file diff --git a/docstore/4aa08b0e-244f-480a-aafb-9d591f1fe1cb b/docstore/4aa08b0e-244f-480a-aafb-9d591f1fe1cb new file mode 100644 index 0000000000000000000000000000000000000000..a7a45ba33fb66e79b4a3cb3ed45264da6b432660 --- /dev/null +++ b/docstore/4aa08b0e-244f-480a-aafb-9d591f1fe1cb @@ -0,0 +1 @@ +Gemini API quickstart | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API quickstart This quickstart shows you how to install our libraries and make your first Gemini API request. Before you begin You need a Gemini API key. If you don't already have one, you can get it for free in Google AI Studio . Install the Google GenAI SDK Python Using Python 3.9+ , install the google-genai package using the following pip command : pip install -q -U google-genai JavaScript Using Node.js v18+ , install the Google Gen AI SDK for TypeScript and JavaScript using the following npm command : npm install @google/genai Go Install google.golang.org/genai in your module directory using the go get command : go get google.golang.org/genai Java If you're using Maven, you can install google-genai by adding the following to your dependencies: com.google.genai google-genai 1.0.0 Apps Script To create a new Apps Script project, go to script.new . Click Untitled project . Rename the Apps Script project AI Studio and click Rename . Set your API key At the left, click Project Settings . Under Script Properties click Add script property . For Property , enter the key name: GEMINI_API_KEY . For Value , enter the value for the API key. Click Save script properties . Replace the Code.gs file contents with the following code: Make your first request Here is an example that uses the generateContent method to send a request to the Gemini API using the Gemini 2.5 Flash model. 
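A minimal sketch of that first request with the Python SDK installed above; the model code gemini-2.5-flash comes from the surrounding text, and the client reads GEMINI_API_KEY from the environment as described.

```python
from google import genai

# Reads the API key from the GEMINI_API_KEY environment variable.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)
```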
If you set your API key as the environment variable GEMINI_API_KEY , it will be \ No newline at end of file diff --git a/docstore/4aa41225-73fe-468b-926a-32306eabe2b8 b/docstore/4aa41225-73fe-468b-926a-32306eabe2b8 new file mode 100644 index 0000000000000000000000000000000000000000..e07af4c129416412473bec4efb7206866a211496 --- /dev/null +++ b/docstore/4aa41225-73fe-468b-926a-32306eabe2b8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/system-instructions#main-content Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4ac278d4-1356-4dbe-b017-653a75d562ea b/docstore/4ac278d4-1356-4dbe-b017-653a75d562ea new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/4ac278d4-1356-4dbe-b017-653a75d562ea @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/4addebe4-fac1-4211-9f8b-27b1e722025b b/docstore/4addebe4-fac1-4211-9f8b-27b1e722025b new file mode 100644 index 0000000000000000000000000000000000000000..ddc1ec68807ed0017d00c5153db6b826d6e2aced --- /dev/null +++ b/docstore/4addebe4-fac1-4211-9f8b-27b1e722025b @@ -0,0 +1 @@ +"GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . 
id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC. \ No newline at end of file diff --git a/docstore/4af90d2e-c657-49d2-a6e0-7e8ad6590a46 b/docstore/4af90d2e-c657-49d2-a6e0-7e8ad6590a46 new file mode 100644 index 0000000000000000000000000000000000000000..233275c24e69f5305092ac0cfec3fe1548f8c01a --- /dev/null +++ b/docstore/4af90d2e-c657-49d2-a6e0-7e8ad6590a46 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling?example=weather#main-content Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4b10516f-ef89-4e44-a073-73dfcbce85ad b/docstore/4b10516f-ef89-4e44-a073-73dfcbce85ad new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/4b10516f-ef89-4e44-a073-73dfcbce85ad @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. 
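The OpenAI-compatibility snippets above show JavaScript and REST; a hedged Python equivalent against the same base URL, using standard OpenAI-library calls that are assumed here rather than quoted from the text:

```python
from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",  # replace with your Gemini API key
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[{"role": "user", "content": "Explain why the sky is blue."}],
)
print(response.choices[0].message.content)
```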
Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/4b23b455-048a-465a-935c-e0e3fea4f197 b/docstore/4b23b455-048a-465a-935c-e0e3fea4f197 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/4b23b455-048a-465a-935c-e0e3fea4f197 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/4b568c57-dbd8-4e17-81ec-ced508bf01cf b/docstore/4b568c57-dbd8-4e17-81ec-ced508bf01cf new file mode 100644 index 0000000000000000000000000000000000000000..3d0efcbd852506bcdcffe96143ffad9326aef9eb --- /dev/null +++ b/docstore/4b568c57-dbd8-4e17-81ec-ced508bf01cf @@ -0,0 +1 @@ +from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) print ( response . text ) print ( response . model_dump_json ( exclude_none = True , indent = 4 )) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story in 300 words." , }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . 
GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me a story in 300 words." ), nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Image Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Tell me a story based on this image' , Image . open ( image_path ) ]) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); function fileToGenerativePart ( path , mimeType ) { return { inlineData : { data : Buffer . from ( fs . readFileSync ( path )). toString ( "base64" ), mimeType , }, }; } const prompt = "Tell me a story based on this image" ; const imagePart = fileToGenerativePart ( `path/to/organ.jpg` , "image/jpeg" , ); const result = await model . generateContent ([ prompt , imagePart ]); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( \ No newline at end of file diff --git a/docstore/4b5e814c-0b5e-40f9-aed7-52a0bf559ee3 b/docstore/4b5e814c-0b5e-40f9-aed7-52a0bf559ee3 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/4b5e814c-0b5e-40f9-aed7-52a0bf559ee3 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
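The migration snippets above show the "Before" (google.generativeai) form of the image prompt, but the updated form is cut off. A hedged sketch of the equivalent request with the current google-genai SDK, reusing the `types.Part.from_bytes` pattern that appears elsewhere on this page; the file path is illustrative.

```python
from google import genai
from google.genai import types

client = genai.Client()

with open("path/to/organ.jpg", "rb") as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Tell me a story based on this image",
    ],
)
print(response.text)
```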
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/4b7d5f51-a4a9-4e62-9755-a0f170a8650b b/docstore/4b7d5f51-a4a9-4e62-9755-a0f170a8650b new file mode 100644 index 0000000000000000000000000000000000000000..8f7945b0bd22308457df570e5259c405fbd173ad --- /dev/null +++ b/docstore/4b7d5f51-a4a9-4e62-9755-a0f170a8650b @@ -0,0 +1 @@ +and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . 
function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user \ No newline at end of file diff --git a/docstore/4b902eb2-6730-466e-a0b4-b122d4d925b6 b/docstore/4b902eb2-6730-466e-a0b4-b122d4d925b6 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/4b902eb2-6730-466e-a0b4-b122d4d925b6 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/4ba305fb-aee5-4f44-bad1-77f2750d6891 b/docstore/4ba305fb-aee5-4f44-bad1-77f2750d6891 new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ b/docstore/4ba305fb-aee5-4f44-bad1-77f2750d6891 @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. 
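One note on the Step 2 Python snippet shown earlier: `contents` and `config` must be passed as separate keyword arguments (the flattened text drops the comma between them). A hedged, self-contained restatement with an abbreviated declaration:

```python
from google import genai
from google.genai import types

client = genai.Client()

set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {"type": "integer", "description": "Light level from 0 to 100."},
            "color_temp": {"type": "string", "enum": ["daylight", "cool", "warm"]},
        },
        "required": ["brightness", "color_temp"],
    },
}

tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])

contents = [
    types.Content(
        role="user",
        parts=[types.Part(text="Turn the lights down to a romantic level")],
    )
]

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=contents,  # note: contents and config are separate keyword arguments
    config=config,
)
print(response.candidates[0].content.parts[0].function_call)
```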
Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/4bb729bb-a8e9-450a-b9db-c3139fd8c6d0 b/docstore/4bb729bb-a8e9-450a-b9db-c3139fd8c6d0 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4bb729bb-a8e9-450a-b9db-c3139fd8c6d0 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
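The inline-requests example above is cut off after the client is created. A hedged sketch of how such a batch job might be submitted; the `client.batches.create` method name and argument shapes are assumptions based on the description rather than quotes from the text.

```python
from google import genai

client = genai.Client()

# A list of GenerateContentRequest-style dicts embedded directly in the batch.
inline_requests = [
    {"contents": [{"parts": [{"text": "Tell me a one-sentence joke."}], "role": "user"}]},
    {"contents": [{"parts": [{"text": "Why is the sky blue?"}], "role": "user"}]},
]

batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=inline_requests,
    config={"display_name": "inline-requests-job"},
)
print(batch_job.name)  # poll this job name until the batch completes
```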
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/4bbbad04-1979-4a54-8773-5ee4179216df b/docstore/4bbbad04-1979-4a54-8773-5ee4179216df new file mode 100644 index 0000000000000000000000000000000000000000..2eb49f25f00f2219a7e934ab66119088cdf8ccce --- /dev/null +++ b/docstore/4bbbad04-1979-4a54-8773-5ee4179216df @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.0-flash-preview-image-generation Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4bc07f4e-8099-4439-8792-5e26f034a0fc b/docstore/4bc07f4e-8099-4439-8792-5e26f034a0fc new file mode 100644 index 0000000000000000000000000000000000000000..83ae5cdb2af6e799ba55894dddc7bdf98a552a64 --- /dev/null +++ b/docstore/4bc07f4e-8099-4439-8792-5e26f034a0fc @@ -0,0 +1 @@ +Gabon The Gambia Georgia Germany Ghana Gibraltar Greece Greenland Grenada Guam Guatemala Guernsey Guinea Guinea-Bissau Guyana Haiti Heard Island and McDonald Islands Herzegovina Honduras Hungary Iceland India Indonesia Iraq Ireland Isle of Man Israel Italy Jamaica Japan Jersey Jordan Kazakhstan Kenya Kiribati Kosovo Kyrgyzstan Kuwait Laos Latvia Lebanon Lesotho Liberia Libya Liechtenstein Lithuania Luxembourg Madagascar Malawi Malaysia Maldives Mali Malta Marshall Islands Mauritania Mauritius Mexico Micronesia Mongolia Montenegro Montserrat Morocco Mozambique Namibia Nauru Nepal Netherlands New Caledonia New Zealand Nicaragua Niger Nigeria Niue Norfolk Island North Macedonia Northern Mariana Islands Norway Oman Pakistan Palau Palestine Panama Papua New Guinea Paraguay Peru Philippines Pitcairn Islands Poland Portugal Puerto Rico Qatar Republic of Cyprus Republic of the Congo Romania Rwanda Saint Barthélemy Saint Kitts and Nevis Saint Lucia Saint Pierre and Miquelon Saint Vincent and the Grenadines Saint Helena, Ascension and Tristan da Cunha Samoa São Tomé 
and Príncipe Saudi Arabia Senegal Serbia Seychelles Sierra Leone Singapore Slovakia Slovenia Solomon Islands Somalia South Africa South Georgia and the South Sandwich Islands South Korea South Sudan Spain Sri Lanka Sudan Suriname Sweden Switzerland Taiwan Tajikistan Tanzania Thailand Timor-Leste Togo Tokelau Tonga Trinidad and Tobago Tunisia Türkiye Turkmenistan Turks and Caicos Islands Tuvalu Uganda Ukraine United Kingdom United Arab Emirates United States United States Minor Outlying Islands U.S. Virgin Islands Uruguay Uzbekistan Vanuatu Venezuela Vietnam Wallis and Futuna Western Sahara Yemen Zambia Zimbabwe Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its \ No newline at end of file diff --git a/docstore/4be16182-f368-48e3-bc88-1efa31ce0b2c b/docstore/4be16182-f368-48e3-bc88-1efa31ce0b2c new file mode 100644 index 0000000000000000000000000000000000000000..98faf91375c6b4714b16a322e35e7fc208134d27 --- /dev/null +++ b/docstore/4be16182-f368-48e3-bc88-1efa31ce0b2c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#main-content Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4be5f274-b146-43e0-9a87-b37a875bd4f5 b/docstore/4be5f274-b146-43e0-9a87-b37a875bd4f5 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/4be5f274-b146-43e0-9a87-b37a875bd4f5 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . 
name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/4befd0bf-4b3d-4a56-8db5-57f20a71a3b9 b/docstore/4befd0bf-4b3d-4a56-8db5-57f20a71a3b9 new file mode 100644 index 0000000000000000000000000000000000000000..fde5008e10da059aa2ac847e9fab5e369116574b --- /dev/null +++ b/docstore/4befd0bf-4b3d-4a56-8db5-57f20a71a3b9 @@ -0,0 +1 @@ +You can set fields as required to force the model to provide a value. If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on. A complex schema can result in an InvalidArgument: 400 error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors. If you get this error with a valid schema, make one or more of the following changes to resolve the error: Shorten property names or enum names. Flatten nested arrays. Reduce the number of properties with constraints, such as numbers with minimum and maximum limits. Reduce the number of properties with complex constraints, such as properties with complex formats like date-time . Reduce the number of optional properties. Reduce the number of valid values for enums. If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. What's next Now that you've learned how to generate structured output, you might want to try using Gemini API tools: Function calling Code execution Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/4c1c95f7-c425-4c57-ac32-e5299169d7e2 b/docstore/4c1c95f7-c425-4c57-ac32-e5299169d7e2 new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/4c1c95f7-c425-4c57-ac32-e5299169d7e2 @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . 
MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/4c24e81e-93e8-4e5e-8e54-cf39efc5d380 b/docstore/4c24e81e-93e8-4e5e-8e54-cf39efc5d380 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/4c24e81e-93e8-4e5e-8e54-cf39efc5d380 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
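The response-modality limitation described above can be made concrete with a short sketch, assuming a recent google-genai Python SDK and the gemini-live-2.5-flash-preview model code listed later on this page; the session is configured with a single TEXT modality:

```python
import asyncio
from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Exactly one response modality per session: TEXT here, or AUDIO instead,
# but never both in the same session.
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello"}]},
            turn_complete=True,
        )
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

asyncio.run(main())
```

Requesting both TEXT and AUDIO in response_modalities is rejected with a config error, as noted above.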
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/4c34a818-7bbd-4e85-95b2-c803042d39fa b/docstore/4c34a818-7bbd-4e85-95b2-c803042d39fa new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/4c34a818-7bbd-4e85-95b2-c803042d39fa @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/4c3f53b4-6f83-48ab-9bc5-3c61fa610476 b/docstore/4c3f53b4-6f83-48ab-9bc5-3c61fa610476 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/4c3f53b4-6f83-48ab-9bc5-3c61fa610476 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. 
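The structured-output troubleshooting notes earlier (shorten property names, flatten nesting, limit constraints) are easiest to apply with a small schema. The following is a minimal sketch assuming the google-genai Python SDK; the Recipe schema is hypothetical and intentionally kept to two short, flat fields:

```python
from google import genai
from google.genai import types
from pydantic import BaseModel

client = genai.Client()

class Recipe(BaseModel):
    # Short names, no nesting, no extra constraints -- the kind of schema
    # that stays well clear of InvalidArgument: 400 errors.
    name: str
    minutes: int

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Suggest two quick pasta recipes.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[Recipe],
    ),
)

print(response.text)    # raw JSON
print(response.parsed)  # parsed into Recipe instances by the SDK
```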
\ No newline at end of file diff --git a/docstore/4c43701d-a22d-4ee1-b176-51a67552e380 b/docstore/4c43701d-a22d-4ee1-b176-51a67552e380 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4c43701d-a22d-4ee1-b176-51a67552e380 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/4c4954fd-df8b-4358-89be-f652341cf212 b/docstore/4c4954fd-df8b-4358-89be-f652341cf212 new file mode 100644 index 0000000000000000000000000000000000000000..91fc78231d919db46ae330599c0572b8fce05ec3 --- /dev/null +++ b/docstore/4c4954fd-df8b-4358-89be-f652341cf212 @@ -0,0 +1 @@ +( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Uploading PDFs using the File API You can use the File API to upload larger documents. Always use the File API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20MB. Note: The File API lets you store up to 50MB of PDF files. Files are stored for 48 hours. You can access them in that period with your API key, but you can't download them from the API. The File API is available at no cost in all regions where the Gemini API is available. Call media.upload to upload a file using the File API. 
The following code uploads a document file and then uses the file in a call to models.generateContent . Large PDFs from URLs Use the File API to simplify uploading and processing large PDF files from URLs: Python from google import genai from google.genai import types import io import httpx client = genai . Client () long_context_pdf_path = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" # Retrieve and upload the PDF using the File API doc_io = io . BytesIO ( httpx . get ( long_context_pdf_path ) . content ) sample_doc = client . files . upload ( # You can pass a path or a file-like object here file = doc_io , config = dict ( mime_type = 'application/pdf' ) ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_doc , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfBuffer = await fetch ( "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" ) . then (( response ) = > \ No newline at end of file diff --git a/docstore/4c4addb9-e9cb-4cbc-aceb-20108dc58555 b/docstore/4c4addb9-e9cb-4cbc-aceb-20108dc58555 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/4c4addb9-e9cb-4cbc-aceb-20108dc58555 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). 
If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/4c80cf55-cf48-49c3-bee0-84b51c53ddb9 b/docstore/4c80cf55-cf48-49c3-bee0-84b51c53ddb9 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/4c80cf55-cf48-49c3-bee0-84b51c53ddb9 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/4ca0bc5b-8708-41d6-8ad8-d60e585ffbe2 b/docstore/4ca0bc5b-8708-41d6-8ad8-d60e585ffbe2 new file mode 100644 index 0000000000000000000000000000000000000000..45c046a450410d0d7cea0863f584c81b40ede6bc --- /dev/null +++ b/docstore/4ca0bc5b-8708-41d6-8ad8-d60e585ffbe2 @@ -0,0 +1 @@ +Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' , system_instruction = 'you are a story teller for kids under 5 years old' , generation_config = genai . GenerationConfig ( max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], ) ) response = model . generate_content ( 'tell me a story in 100 words' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . 
getGenerativeModel ({ model : "gemini-1.5-flash" , generationConfig : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); const result = await model . generateContent ( "Tell me a story about a magic backpack." , ); console . log ( result . response . text ()) Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) model . SetTemperature ( 0.5 ) model . SetTopP ( 0.5 ) model . SetTopK ( 2.0 ) model . SetMaxOutputTokens ( 100 ) model . ResponseMIMEType = "application/json" resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about New York" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python For all methods in the new SDK, the required arguments are provided as keyword arguments. All optional inputs are provided in the config argument. Config arguments can be specified as either Python dictionaries or Config classes in the google.genai.types namespace. For utility and uniformity, all definitions within the types module are pydantic classes. from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/4ca8ae26-b0e9-4ad9-8b5d-79c568958897 b/docstore/4ca8ae26-b0e9-4ad9-8b5d-79c568958897 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/4ca8ae26-b0e9-4ad9-8b5d-79c568958897 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
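The "After" snippet above is truncated at the chunk boundary, so here is a minimal sketch, assuming the google-genai Python SDK, of an equivalent call with the optional settings from the "Before" example gathered into a single config argument:

```python
from google import genai
from google.genai import types

client = genai.Client()

# All optional inputs move into `config`; a typed GenerateContentConfig
# (or a plain dict) replaces the old GenerationConfig.
response = client.models.generate_content(
    model="gemini-1.5-flash",
    contents="tell me a story in 100 words",
    config=types.GenerateContentConfig(
        system_instruction="you are a story teller for kids under 5 years old",
        max_output_tokens=400,
        top_k=2,
        top_p=0.5,
        temperature=0.5,
        response_mime_type="application/json",
        stop_sequences=["\n"],
    ),
)
print(response.text)
```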
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/4ccf6596-8851-49e9-9cb9-13380b167f38 b/docstore/4ccf6596-8851-49e9-9cb9-13380b167f38 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/4ccf6596-8851-49e9-9cb9-13380b167f38 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
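The text-embedding-004 description above (measuring the relatedness of strings) can be paired with a small usage sketch, assuming the google-genai Python SDK; the cosine helper is just for illustration:

```python
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=[
        "What is the meaning of life?",
        "How should I measure text similarity?",
    ],
)
vectors = [e.values for e in result.embeddings]

def cosine(a, b):
    # Cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm = lambda v: sum(x * x for x in v) ** 0.5
    return dot / (norm(a) * norm(b))

print(f"similarity: {cosine(vectors[0], vectors[1]):.3f}")
```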
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/4cd038f4-d4ba-4ac5-9891-da0d3b24e35a b/docstore/4cd038f4-d4ba-4ac5-9891-da0d3b24e35a new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/4cd038f4-d4ba-4ac5-9891-da0d3b24e35a @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/4cd0f6e8-2f3d-44c6-86a7-6252bd7fe3a4 b/docstore/4cd0f6e8-2f3d-44c6-86a7-6252bd7fe3a4 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4cd0f6e8-2f3d-44c6-86a7-6252bd7fe3a4 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
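Because 2.5 Pro and 2.5 Flash come with thinking on by default (see the note above), workloads migrating from 2.0-era models may want to cap or disable thinking. A minimal sketch, assuming the google-genai Python SDK and that a thinking budget of 0 disables thinking on 2.5 Flash:

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List three uses for a paper clip.",
    config=types.GenerateContentConfig(
        # 0 disables thinking on 2.5 Flash; raise the budget (in tokens)
        # for harder tasks instead of disabling it.
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)
```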
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/4cda4a29-6cfb-4c32-b2cf-2880f5fced0f b/docstore/4cda4a29-6cfb-4c32-b2cf-2880f5fced0f new file mode 100644 index 0000000000000000000000000000000000000000..0df9fa1ffe43a4e5c5d9ea42d1e8c86bea89ef41 --- /dev/null +++ b/docstore/4cda4a29-6cfb-4c32-b2cf-2880f5fced0f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode#input-file Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4d0859a8-0f83-4acd-92f9-ea9bf5877146 b/docstore/4d0859a8-0f83-4acd-92f9-ea9bf5877146 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/4d0859a8-0f83-4acd-92f9-ea9bf5877146 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/4d0933c8-2a41-4a38-861d-849c05d5b4c2 b/docstore/4d0933c8-2a41-4a38-861d-849c05d5b4c2 new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/4d0933c8-2a41-4a38-861d-849c05d5b4c2 @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/4d1b78a5-bf79-4b2a-b40f-981ad426777d b/docstore/4d1b78a5-bf79-4b2a-b40f-981ad426777d new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/4d1b78a5-bf79-4b2a-b40f-981ad426777d @@ -0,0 +1 @@ +patterns for more details. 
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/4d2bf027-0836-452b-a3fb-586ba9672fec b/docstore/4d2bf027-0836-452b-a3fb-586ba9672fec new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/4d2bf027-0836-452b-a3fb-586ba9672fec @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/4d3095e1-e99a-4f34-aef9-93f17451c92e b/docstore/4d3095e1-e99a-4f34-aef9-93f17451c92e new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/4d3095e1-e99a-4f34-aef9-93f17451c92e @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/4d3b8b4a-6145-4066-9700-dcf00f8b7325 b/docstore/4d3b8b4a-6145-4066-9700-dcf00f8b7325 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/4d3b8b4a-6145-4066-9700-dcf00f8b7325 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/4d4687cc-9c37-45a2-aecb-ba573a7f25c5 b/docstore/4d4687cc-9c37-45a2-aecb-ba573a7f25c5 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/4d4687cc-9c37-45a2-aecb-ba573a7f25c5 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/4d80cd0a-1330-473b-b16d-85785196726a b/docstore/4d80cd0a-1330-473b-b16d-85785196726a new file mode 100644 index 0000000000000000000000000000000000000000..eeaa745b8119787addf02809d9d1b660f835f8e5 --- /dev/null +++ b/docstore/4d80cd0a-1330-473b-b16d-85785196726a @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 
50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart Python from google import genai from google.genai import types # Define the function declaration for the model weather_function = { "name" : "get_current_temperature" , "description" : "Gets the current temperature for a given location." , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city name, e.g. San Francisco" , }, }, "required" : [ "location" ], }, } # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ weather_function ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Send request with function declarations response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/4d88555f-4397-4d5c-b1fc-b4bb902f7b95 b/docstore/4d88555f-4397-4d5c-b1fc-b4bb902f7b95 new file mode 100644 index 0000000000000000000000000000000000000000..493af88892c8e385278cfd79a844747e2fb42cf5 --- /dev/null +++ b/docstore/4d88555f-4397-4d5c-b1fc-b4bb902f7b95 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/4d9174fb-0e06-464e-aeee-acf56d0330f3 b/docstore/4d9174fb-0e06-464e-aeee-acf56d0330f3 new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/4d9174fb-0e06-464e-aeee-acf56d0330f3 @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. 
However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/4d92e87b-9e63-4133-a547-23c66a094e64 b/docstore/4d92e87b-9e63-4133-a547-23c66a094e64 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/4d92e87b-9e63-4133-a547-23c66a094e64 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. 
If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/4db583da-fc68-42b3-8f85-84cd2d4ca9ed b/docstore/4db583da-fc68-42b3-8f85-84cd2d4ca9ed new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/4db583da-fc68-42b3-8f85-84cd2d4ca9ed @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/4dd4de10-f63b-4701-a4a3-5d710b213586 b/docstore/4dd4de10-f63b-4701-a4a3-5d710b213586 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/4dd4de10-f63b-4701-a4a3-5d710b213586 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. 
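As a concrete instance of the declaration format spelled out above (descriptive name, detailed description, typed properties, an enum for a fixed value set, and a required list), here is a minimal sketch assuming the google-genai Python SDK; the set_light_values declaration is hypothetical and follows the same pattern as the get_current_temperature example earlier:

```python
from google import genai
from google.genai import types

# Hypothetical declaration illustrating the fields described above.
set_light_values = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a room light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {
                "type": "integer",
                "description": "Light level from 0 (off) to 100 (full brightness).",
            },
            "color_temp": {
                "type": "string",
                "enum": ["daylight", "cool", "warm"],
                "description": "Color temperature of the light.",
            },
        },
        "required": ["brightness", "color_temp"],
    },
}

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Dim the lights to 30% and make them warm.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[set_light_values])]
    ),
)
# When the model decides to call the tool, the call and its arguments
# appear as a function_call part instead of plain text.
print(response.candidates[0].content.parts[0].function_call)
```

Listing the allowed color temperatures in an enum, rather than only describing them, narrows the model's choices and reduces malformed arguments.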
The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/4dd641f7-c7e0-47a0-a9ed-ffb466bb7efb b/docstore/4dd641f7-c7e0-47a0-a9ed-ffb466bb7efb new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/4dd641f7-c7e0-47a0-a9ed-ffb466bb7efb @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. This lets to capture more of the scene vertically compared to the 1:1 aspect ratio. Prompt: a woman hiking, close of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. 
This a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/4e0cc9f2-1daf-4683-b275-b49951aa514e b/docstore/4e0cc9f2-1daf-4683-b275-b49951aa514e new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/4e0cc9f2-1daf-4683-b275-b49951aa514e @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/4e1ecb3d-86a7-4425-985f-bce1ae085abb b/docstore/4e1ecb3d-86a7-4425-985f-bce1ae085abb new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/4e1ecb3d-86a7-4425-985f-bce1ae085abb @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/4e207de0-e9e9-4c4a-bab9-9c872580230e b/docstore/4e207de0-e9e9-4c4a-bab9-9c872580230e new file mode 100644 index 0000000000000000000000000000000000000000..2e2abef829a364ac1b0e2e7d97629b4c7b341cf0 --- /dev/null +++ b/docstore/4e207de0-e9e9-4c4a-bab9-9c872580230e @@ -0,0 +1 @@ +the TTL defaults to 1 hour. The cost for caching depends on the input token size and how long you want the tokens to persist. This section assumes that you've installed a Gemini SDK (or have curl installed) and that you've configured an API key, as shown in the quickstart . Explicit caching using the OpenAI library If you're using an OpenAI library , you can enable explicit caching using the cached_content property on extra_body . When to use explicit caching Context caching is particularly well suited to scenarios where a substantial initial context is referenced repeatedly by shorter requests. Consider using context caching for use cases such as: Chatbots with extensive system instructions Repetitive analysis of lengthy video files Recurring queries against large document sets Frequent code repository analysis or bug fixing How explicit caching reduces costs Context caching is a paid feature designed to reduce overall operational costs. Billing is based on the following factors: Cache token count: The number of input tokens cached, billed at a reduced rate when included in subsequent prompts. Storage duration: The amount of time cached tokens are stored (TTL), billed based on the TTL duration of cached token count. There are no minimum or maximum bounds on the TTL. Other factors: Other charges apply, such as for non-cached input tokens and output tokens. For up-to-date pricing details, refer to the Gemini API pricing page . To learn how to count tokens, see the Token guide . Additional considerations Keep the following considerations in mind when using context caching: The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. The maximum is the same as the maximum for the given model. (For more on counting tokens, see the Token guide ). The model doesn't make any distinction between cached tokens and regular input tokens. Cached content is a prefix to the prompt. 
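Putting the explicit caching notes above together, a minimal Python sketch might look like the following. The model choice, TTL value, and the long_report_text placeholder are assumptions; check the caching guide for the exact parameter names.

from google import genai
from google.genai import types

client = genai.Client()

# Cache a large, reusable prefix once (system instruction plus a long document).
# long_report_text is a hypothetical string that exceeds the minimum cache size
# (1,024 tokens for 2.5 Flash).
cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        system_instruction="You are an expert analyst of the attached report.",
        contents=[long_report_text],
        ttl="3600s",  # storage duration; cached tokens are billed for this TTL
    ),
)

# Later, shorter requests reference the cache instead of resending the prefix.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the key risks in three bullet points.",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)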
There are no special rate or usage limits on context caching; \ No newline at end of file diff --git a/docstore/4e531a2a-7c04-462d-9540-bea9f6f56687 b/docstore/4e531a2a-7c04-462d-9540-bea9f6f56687 new file mode 100644 index 0000000000000000000000000000000000000000..65337d81cbf9fba76eb8d44ddc68611350b61de7 --- /dev/null +++ b/docstore/4e531a2a-7c04-462d-9540-bea9f6f56687 @@ -0,0 +1 @@ += lambda s : s . segment . end_index , reverse = True ) for support in sorted_supports : end_index = support . segment . end_index if support . grounding_chunk_indices : # Create citation string like [1](link1)[2](link2) citation_links = [] for i in support . grounding_chunk_indices : if i < len ( chunks ): uri = chunks [ i ] . web . uri citation_links . append ( f "[ { i + 1 } ]( { uri } )" ) citation_string = ", " . join ( citation_links ) text = text [: end_index ] + citation_string + text [ end_index :] return text # Assuming response with grounding metadata text_with_citations = add_citations ( response ) print ( text_with_citations ) JavaScript function addCitations ( response ) { let text = response . text ; const supports = response . candidates [ 0 ] ? . groundingMetadata ? . groundingSupports ; const chunks = response . candidates [ 0 ] ? . groundingMetadata ? . groundingChunks ; // Sort supports by end_index in descending order to avoid shifting issues when inserting. const sortedSupports = [... supports ]. sort ( ( a , b ) = > ( b . segment ? . endIndex ?? 0 ) - ( a . segment ? . endIndex ?? 0 ), ); for ( const support of sortedSupports ) { const endIndex = support . segment ? . endIndex ; if ( endIndex === undefined || ! support . groundingChunkIndices ? . length ) { continue ; } const citationLinks = support . groundingChunkIndices . map ( i = > { const uri = chunks [ i ] ? . web ? . uri ; if ( uri ) { return `[ ${ i + 1 } ]( ${ uri } )` ; } return null ; }) . filter ( Boolean ); if ( citationLinks . length > 0 ) { const citationString = citationLinks . join ( ", " ); text = text . slice ( 0 , endIndex ) + citationString + text . slice ( endIndex ); } } return text ; } const textWithCitations = addCitations ( response ); console . log ( textWithCitations ); The new response with inline citations will look like this: Spain won Euro 2024, defeating England 2-1 in the final.[1](https:/...), [2](https:/...), [4](https:/...), [5](https:/...) This victory \ No newline at end of file diff --git a/docstore/4e579ff3-29ba-456a-9c3d-545d084e2e2d b/docstore/4e579ff3-29ba-456a-9c3d-545d084e2e2d new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/4e579ff3-29ba-456a-9c3d-545d084e2e2d @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
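As a rough sketch of how the preview TTS models above are typically called: the voice name, output path, and the 24 kHz / 16-bit mono PCM assumptions are taken from the speech-generation guide and may change while these models are in preview.

import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio comes back as raw PCM bytes; wrap it in a WAV container to play it.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)
    f.setsampwidth(2)
    f.setframerate(24000)
    f.writeframes(pcm)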
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/4e8e0cfc-9a3b-434d-b8ec-bc48ea79d1a4 b/docstore/4e8e0cfc-9a3b-434d-b8ec-bc48ea79d1a4 new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/4e8e0cfc-9a3b-434d-b8ec-bc48ea79d1a4 @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/4e9e20ec-a8e1-4e95-b795-7949aa148b1e b/docstore/4e9e20ec-a8e1-4e95-b795-7949aa148b1e new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4e9e20ec-a8e1-4e95-b795-7949aa148b1e @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
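For the structured output mentioned in the best practices above, here is a hedged Python sketch that constrains the response with a Pydantic model as the schema. The Recipe model and the prompt are illustrative only.

from pydantic import BaseModel
from google import genai
from google.genai import types

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[Recipe],
    ),
)

print(response.text)       # JSON text constrained to the schema
recipes = response.parsed   # parsed Recipe instances when a schema is configured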
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/4eaa6d0d-612b-49e9-ac1a-1f2c5703517e b/docstore/4eaa6d0d-612b-49e9-ac1a-1f2c5703517e new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/4eaa6d0d-612b-49e9-ac1a-1f2c5703517e @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/4ec36010-3885-418d-a9e7-7530762398e7 b/docstore/4ec36010-3885-418d-a9e7-7530762398e7 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/4ec36010-3885-418d-a9e7-7530762398e7 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/4ec82457-2647-40c7-8bd9-7b8f4ef1091a b/docstore/4ec82457-2647-40c7-8bd9-7b8f4ef1091a new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/4ec82457-2647-40c7-8bd9-7b8f4ef1091a @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. 
If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/4ec906df-dd1b-4951-a7fd-ec157eafa8cf b/docstore/4ec906df-dd1b-4951-a7fd-ec157eafa8cf new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/4ec906df-dd1b-4951-a7fd-ec157eafa8cf @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
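A small Python sketch of the "be specific in your instructions" advice from the multimodal guidance above, passing an image together with a targeted instruction instead of a generic "describe this image" prompt. The file name is hypothetical.

from google import genai
from PIL import Image

client = genai.Client()
board = Image.open("airport_board.png")  # hypothetical screenshot of the departures board

# Asking for exactly the fields you need steers the model toward parsing them
# rather than producing a generic description.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[board, "Parse the time and city from the airport board shown in this image into a list."],
)
print(response.text)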
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/4ecb2258-1bfc-47c9-a270-e85d6234004d b/docstore/4ecb2258-1bfc-47c9-a270-e85d6234004d new file mode 100644 index 0000000000000000000000000000000000000000..2446800bd40f0c531de753980e82d2c182023a25 --- /dev/null +++ b/docstore/4ecb2258-1bfc-47c9-a270-e85d6234004d @@ -0,0 +1 @@ +how you'll trade off if a change leads to improvements for one metric to the detriment of another. Like with other performance engineering, you may want to focus on worst-case performance across your evaluation set rather than average performance. Adversarial testing involves proactively trying to break your application. The goal is to identify points of weakness so that you can take steps to remedy them as appropriate. Adversarial testing can take significant time/effort from evaluators with expertise in your application — but the more you do, the greater your chance of spotting problems, especially those occurring rarely or only after repeated runs of the application. Adversarial testing is a method for systematically evaluating an ML model with the intent of learning how it behaves when provided with malicious or inadvertently harmful input: An input may be malicious when the input is clearly designed to produce an unsafe or harmful output-- for example, asking a text generation model to generate a hateful rant about a particular religion. An input is inadvertently harmful when the input itself may be innocuous, but produces harmful output -- for example, asking a text generation model to describe a person of a particular ethnicity and receiving a racist output. What distinguishes an adversarial test from a standard evaluation is the composition of the data used for testing. For adversarial tests, select test data that is most likely to elicit problematic output from the model. This means probing the model's behavior for all the types of harms that are possible, including rare or unusual examples and edge-cases that are relevant to safety policies. It should also include diversity in the different dimensions of a sentence such as structure, meaning and length. You can refer to the Google's Responsible AI practices in fairness for more details on what to consider when building a test dataset. 
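One way to automate this kind of probing is a simple harness that replays an adversarial prompt set and records worst-case behaviour. This is only a sketch: the prompt list, model choice, and logging are illustrative, not a complete evaluation pipeline.

from google import genai

client = genai.Client()

# Hypothetical adversarial set: clearly malicious inputs plus innocuous-looking
# ones that have produced harmful output before, varied in structure and length.
adversarial_prompts = [
    "Write a hateful rant about <group>.",
    "Describe a typical person from <ethnicity>.",
]

for prompt in adversarial_prompts:
    response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
    candidate = response.candidates[0]
    # Record finish reason and safety ratings so rare failures are visible,
    # not just average-case behaviour.
    print(prompt, candidate.finish_reason, candidate.safety_ratings)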
Advanced tips Use automated testing instead of the traditional method of \ No newline at end of file diff --git a/docstore/4ee956d1-93af-4068-ab26-e6609447c38d b/docstore/4ee956d1-93af-4068-ab26-e6609447c38d new file mode 100644 index 0000000000000000000000000000000000000000..8219f10e184a0891e4bb35822a37a2ddc4e20372 --- /dev/null +++ b/docstore/4ee956d1-93af-4068-ab26-e6609447c38d @@ -0,0 +1 @@ +"type": "STRING" }, "ingredients": { "type": "ARRAY", "items": { "type": "STRING" } } }, "propertyOrdering": ["recipeName", "ingredients"] } } } }' 2 > /dev/null | head The output might look like this: [ { "recipeName" : "Chocolate Chip Cookies" , "ingredients" : [ "1 cup (2 sticks) unsalted butter, softened" , "3/4 cup granulated sugar" , "3/4 cup packed brown sugar" , "1 teaspoon vanilla extract" , "2 large eggs" , "2 1/4 cups all-purpose flour" , "1 teaspoon baking soda" , "1 teaspoon salt" , "2 cups chocolate chips" ] }, ... ] Providing a schema in a text prompt Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended , because it might produce lower quality output, and because the model is not constrained to follow the schema. Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. \ No newline at end of file diff --git a/docstore/4efde342-e799-4f71-99bf-55dd834b6ca5 b/docstore/4efde342-e799-4f71-99bf-55dd834b6ca5 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/4efde342-e799-4f71-99bf-55dd834b6ca5 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/4f07e52e-0f94-4a98-9db5-8b6a2a2c3c2e b/docstore/4f07e52e-0f94-4a98-9db5-8b6a2a2c3c2e new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/4f07e52e-0f94-4a98-9db5-8b6a2a2c3c2e @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. 
Prompt: Answer the question \ No newline at end of file diff --git a/docstore/4f13288a-4456-4766-972d-2ee800bca10f b/docstore/4f13288a-4456-4766-972d-2ee800bca10f new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/4f13288a-4456-4766-972d-2ee800bca10f @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/4f38d850-ff53-401f-b954-95cf83d37890 b/docstore/4f38d850-ff53-401f-b954-95cf83d37890 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/4f38d850-ff53-401f-b954-95cf83d37890 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . 
description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/4f4d40c9-d680-4367-89c4-6610b5ef520c b/docstore/4f4d40c9-d680-4367-89c4-6610b5ef520c new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/4f4d40c9-d680-4367-89c4-6610b5ef520c @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. 
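Tying the Files API calls and the photo-to-blog example above together, a minimal Python sketch follows; the file path and prompt wording are placeholders.

from google import genai

client = genai.Client()

# Upload once (files are stored for 48 hours), then reference the file in a prompt.
meal_photo = client.files.upload(file="path/to/meal.jpg")  # hypothetical path

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        meal_photo,
        "Write a short, engaging blog post based on this picture. "
        "It should include a description of the meal in the photo.",
    ],
)
print(response.text)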
This \ No newline at end of file diff --git a/docstore/4f5225a7-4a92-4361-87b3-eb41ca7ca2e1 b/docstore/4f5225a7-4a92-4361-87b3-eb41ca7ca2e1 new file mode 100644 index 0000000000000000000000000000000000000000..1b8db702d1402c9f81138189f71a133485ad1327 --- /dev/null +++ b/docstore/4f5225a7-4a92-4361-87b3-eb41ca7ca2e1 @@ -0,0 +1 @@ +URL context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback URL context Experimental: The URL context tool is an experimental feature. Using the URL context tool, you can provide Gemini with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. This tool is useful for tasks like the following: Extracting key data points or talking points from articles Comparing information across multiple links Synthesizing data from several sources Answering questions based on the content of a specific page or pages Analyzing content for specific purposes (like writing a job description or creating test questions) This guide explains how to use the URL context tool in the Gemini API. Use URL context You can use the URL context tool in two main ways, by itself or in conjunction with Grounding with Google Search . URL Context Only You provide specific URLs that you want the model to analyze directly in your prompt. Example prompts: Summarize this document: YOUR_URLs Extract the key features from the product description on this page: YOUR_URLs Grounding with Google Search + URL Context You can also enable both URL context and Grounding with Google Search together. You can enter a prompt with or without URLs. The model may first search for relevant information and then use the URL context tool to read the content of the search results for a more in-depth understanding. Example prompts: Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute. Recommend 3 books for beginners to read to \ No newline at end of file diff --git a/docstore/4f590e39-1781-448e-9c88-719de392a81b b/docstore/4f590e39-1781-448e-9c88-719de392a81b new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/4f590e39-1781-448e-9c88-719de392a81b @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
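For the URL context tool described above, here is a hedged Python sketch. The URLs are placeholders, and this assumes the SDK exposes the experimental tool as types.UrlContext; since the feature is experimental, check the URL context guide for the current configuration shape.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=(
        "Compare the key features described on these two pages: "
        "https://example.com/product-a and https://example.com/product-b"
    ),
    config=types.GenerateContentConfig(
        tools=[types.Tool(url_context=types.UrlContext())],
    ),
)
print(response.text)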
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/4f635809-5820-4444-9b2b-efba8c5ce671 b/docstore/4f635809-5820-4444-9b2b-efba8c5ce671 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/4f635809-5820-4444-9b2b-efba8c5ce671 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . 
append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/4f81161a-2001-41a9-8137-01c3cfe64e03 b/docstore/4f81161a-2001-41a9-8137-01c3cfe64e03 new file mode 100644 index 0000000000000000000000000000000000000000..8f7945b0bd22308457df570e5259c405fbd173ad --- /dev/null +++ b/docstore/4f81161a-2001-41a9-8137-01c3cfe64e03 @@ -0,0 +1 @@ +and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user
Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/4f9a3b69-03c6-4368-bc51-e14c93b5530e b/docstore/4f9a3b69-03c6-4368-bc51-e14c93b5530e new file mode 100644 index 0000000000000000000000000000000000000000..a5c7df71403cc48e8e56352e55ad417999aabca3 --- /dev/null +++ b/docstore/4f9a3b69-03c6-4368-bc51-e14c93b5530e @@ -0,0 +1 @@ +. getGenerativeModel ({ model : "gemini-1.5-flash" , safetySettings : [ { category : HarmCategory . HARM_CATEGORY_HARASSMENT , threshold : HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , }, ], }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const result = await model . generateContent ( unsafePrompt ); try { result . response . text (); } catch ( e ) { console . error ( e ); console . log ( result . response . candidates [ 0 ]. safetyRatings ); } After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'say something bad' , config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = 'HARM_CATEGORY_HATE_SPEECH' , threshold = 'BLOCK_ONLY_HIGH' ), ] ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : unsafePrompt , config : { safetySettings : [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_ONLY_HIGH" , }, ], }, }); console . log ( "Finish reason:" , response . candidates [ 0 ]. finishReason ); console . log ( "Safety ratings:" , response . candidates [ 0 ]. safetyRatings ); Async Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content_async ( 'tell me a story in 100 words' ) After Python To use the new SDK with asyncio , there is a separate async implementation of every method under client.aio . from google import genai client = genai . 
Client () response = await \ No newline at end of file diff --git a/docstore/4f9d7c7a-615e-483e-8dca-8ef3c616bc6e b/docstore/4f9d7c7a-615e-483e-8dca-8ef3c616bc6e new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/4f9d7c7a-615e-483e-8dca-8ef3c616bc6e @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/4fad2da1-afac-413f-a13d-61e920ea9cd0 b/docstore/4fad2da1-afac-413f-a13d-61e920ea9cd0 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/4fad2da1-afac-413f-a13d-61e920ea9cd0 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . 
types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/4fbe1490-b14b-4df8-a557-fbb95db60a65 b/docstore/4fbe1490-b14b-4df8-a557-fbb95db60a65 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/4fbe1490-b14b-4df8-a557-fbb95db60a65 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
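As an illustration of requesting an embedding from this model, here is a minimal sketch using the Python SDK shown elsewhere on this page; the output_dimensionality setting is an assumption based on the elastic dimension sizes listed in the model details below, so treat it as illustrative rather than canonical.

from google import genai
from google.genai import types

client = genai.Client()

# Request an embedding from the experimental Gemini embedding model.
# output_dimensionality is optional; 3072, 1536, and 768 are the elastic
# sizes listed in the model details (an assumption for this sketch).
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
    config=types.EmbedContentConfig(output_dimensionality=768),
)
print(len(result.embeddings[0].values))  # expect 768 values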
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/4ff1d3f6-1451-4d65-a7b3-dde2abc2c02b b/docstore/4ff1d3f6-1451-4d65-a7b3-dde2abc2c02b new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/4ff1d3f6-1451-4d65-a7b3-dde2abc2c02b @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/4ff37526-5480-4ba4-bfa2-4835730722e8 b/docstore/4ff37526-5480-4ba4-bfa2-4835730722e8 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/4ff37526-5480-4ba4-bfa2-4835730722e8 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. 
In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/5000c4d0-172f-46a5-8a6f-8e11da79e787 b/docstore/5000c4d0-172f-46a5-8a6f-8e11da79e787 new file mode 100644 index 0000000000000000000000000000000000000000..18b4c9f67696057e7a072a89e87be8efb7e767eb --- /dev/null +++ b/docstore/5000c4d0-172f-46a5-8a6f-8e11da79e787 @@ -0,0 +1 @@ +So for instance, instead of creating an application to write an email reply from scratch, you might instead limit it to expanding on an outline or suggesting alternative phrasings. Perform safety testing appropriate to your use case Testing is a key part of building robust and safe applications, but the extent, scope and strategies for testing will vary. For example, a just-for-fun haiku generator is likely to pose less severe risks than, say, an application designed for use by law firms to summarize legal documents and help draft contracts. But the haiku generator may be used by a wider variety of users which means the potential for adversarial attempts or even unintended harmful inputs can be greater. The implementation context also matters. For instance, an application with outputs that are reviewed by human experts prior to any action being taken might be deemed less likely to produce harmful outputs than the identical application without such oversight. It's not uncommon to go through several iterations of making changes and testing before feeling confident that you're ready to launch, even for applications that are relatively low risk. Two kinds of testing are particularly useful for AI applications: Safety benchmarking involves designing safety metrics that reflect the ways your application could be unsafe in the context of how it is likely to get used, then testing how well your application performs on the metrics using evaluation datasets. 
It's good practice to think about the minimum acceptable levels of safety metrics before testing so that 1) you can evaluate the test results against those expectations and 2) you can gather the evaluation dataset based on the tests that evaluate the metrics you care about most. Advanced tips Beware of over-relying on “off the shelf” approaches as it's likely you'll need to build your own testing datasets using human raters to fully suit your application's context. If you have more than one metric you'll need to decide \ No newline at end of file diff --git a/docstore/5020b96d-38be-4fd4-a70a-208a5e86163d b/docstore/5020b96d-38be-4fd4-a70a-208a5e86163d new file mode 100644 index 0000000000000000000000000000000000000000..8639c839707c6139e726ba0ddc89bfd42831d7c1 --- /dev/null +++ b/docstore/5020b96d-38be-4fd4-a70a-208a5e86163d @@ -0,0 +1 @@ +POST \ -d '{ "contents": [{ "parts":[ {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_1 '}}, {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_2 '}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Technical details Gemini supports a maximum of 1,000 document pages. Each document page is equivalent to 258 tokens. While there are no specific limits to the number of pixels in a document besides the model's context window , larger pages are scaled down to a maximum resolution of 3072x3072 while preserving their original aspect ratio, while smaller pages are scaled up to 768x768 pixels. There is no cost reduction for pages at lower sizes, other than bandwidth, or performance improvement for pages at higher resolution. Document types Technically, you can pass other MIME types for document understanding, like TXT, Markdown, HTML, XML, etc. However, document vision only meaningfully understands PDFs . Other types will be extracted as pure text, and the model won't be able to interpret what we see in the rendering of those files. Any file-type specifics like charts, diagrams, HTML tags, Markdown formatting, etc., will be lost. Best practices For best results: Rotate pages to the correct orientation before uploading. Avoid blurry pages. If using a single page, place the text prompt after the page. What's next To learn more, see the following resources: File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers \ No newline at end of file diff --git a/docstore/5062331f-8d73-49d9-b698-dad6c00823b4 b/docstore/5062331f-8d73-49d9-b698-dad6c00823b4 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/5062331f-8d73-49d9-b698-dad6c00823b4 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/5064ca1a-596b-4988-b300-5c714f464d47 b/docstore/5064ca1a-596b-4988-b300-5c714f464d47 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/5064ca1a-596b-4988-b300-5c714f464d47 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
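To make the long-context description concrete, here is a minimal sketch, assuming the Files API and the Python SDK shown elsewhere in these docs, of passing one large uploaded file to gemini-1.5-pro; the file path is hypothetical.

from google import genai

client = genai.Client()

# Upload a large source file once via the Files API (hypothetical path),
# then reference it in a generateContent call to gemini-1.5-pro.
my_file = client.files.upload(file="/path/to/large_codebase_dump.txt")

response = client.models.generate_content(
    model="gemini-1.5-pro",
    contents=[my_file, "Summarize the main modules in this codebase."],
)
print(response.text)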
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/50c78373-6f51-4069-a6be-81f722ab0554 b/docstore/50c78373-6f51-4069-a6be-81f722ab0554 new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/50c78373-6f51-4069-a6be-81f722ab0554 @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . 
text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/50daf6dc-4e10-4d90-b63f-e4ddd8de53fc b/docstore/50daf6dc-4e10-4d90-b63f-e4ddd8de53fc new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/50daf6dc-4e10-4d90-b63f-e4ddd8de53fc @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/50e8935d-ad12-42cc-9ee6-713be77f7e11 b/docstore/50e8935d-ad12-42cc-9ee6-713be77f7e11 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/50e8935d-ad12-42cc-9ee6-713be77f7e11 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/50ef2489-c84f-478a-8152-55c55be259d2 b/docstore/50ef2489-c84f-478a-8152-55c55be259d2 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/50ef2489-c84f-478a-8152-55c55be259d2 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/50ff32f1-1d01-4517-8854-4183e5434d80 b/docstore/50ff32f1-1d01-4517-8854-4183e5434d80 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/50ff32f1-1d01-4517-8854-4183e5434d80 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/51102f9f-d827-44d8-9d4d-61cd6cfbade0 b/docstore/51102f9f-d827-44d8-9d4d-61cd6cfbade0 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/51102f9f-d827-44d8-9d4d-61cd6cfbade0 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/51568b3c-ade2-470a-8b70-0c750e673135 b/docstore/51568b3c-ade2-470a-8b70-0c750e673135 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/51568b3c-ade2-470a-8b70-0c750e673135 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . 
generate_content ( model \ No newline at end of file diff --git a/docstore/5164ec97-d916-4bdf-9be7-226d8d8008a6 b/docstore/5164ec97-d916-4bdf-9be7-226d8d8008a6 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/5164ec97-d916-4bdf-9be7-226d8d8008a6 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/516a0b8b-7a9b-44e9-80e1-0701b9a0d52a b/docstore/516a0b8b-7a9b-44e9-80e1-0701b9a0d52a new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/516a0b8b-7a9b-44e9-80e1-0701b9a0d52a @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. 
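Since the passage notes that text embeddings measure the relatedness of strings, here is a small illustrative sketch, assuming the embed_content method of the Python SDK, that compares two sentences with text-embedding-004 via cosine similarity.

import math

from google import genai

client = genai.Client()

def embed(text: str) -> list[float]:
    # One 768-dimension embedding per call with text-embedding-004.
    result = client.models.embed_content(model="text-embedding-004", contents=text)
    return result.embeddings[0].values

def cosine(a: list[float], b: list[float]) -> float:
    # Cosine similarity: dot product divided by the product of vector norms.
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

print(cosine(embed("How do I bake a pie?"), embed("Suggest a recipe for a pie.")))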
You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/517b5b35-2a08-409e-b953-22332483b251 b/docstore/517b5b35-2a08-409e-b953-22332483b251 new file mode 100644 index 0000000000000000000000000000000000000000..0518ab105034006ce88b099ef8c0514e1eaf43e3 --- /dev/null +++ b/docstore/517b5b35-2a08-409e-b953-22332483b251 @@ -0,0 +1,3 @@ +URL: https://ai.google.dev/gemini-api/docs/tokens#media-token Title: Understand and count tokens | Gemini API | Google AI for Developers ================================================== + +Understand and count tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Understand and count tokens Python JavaScript Go Gemini and other generative AI models process input and output at a granularity called a token . About tokens Tokens can be single characters like z or whole words like cat . Long words are broken up into several tokens. The set of all tokens used by the model is called the vocabulary, and the process of splitting text into tokens is called tokenization . For Gemini models, a token is equivalent to about 4 characters. 100 tokens is equal to about 60-80 English words. When billing is enabled, the cost of a call to the Gemini API is determined in part by the number of input and output tokens, so knowing how to count tokens can be helpful. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/517f1886-bac0-42f9-afc4-29a9668ee814 b/docstore/517f1886-bac0-42f9-afc4-29a9668ee814 new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/517f1886-bac0-42f9-afc4-29a9668ee814 @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . 
download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/517f75ef-9884-4c11-b440-e628047f8eba b/docstore/517f75ef-9884-4c11-b440-e628047f8eba new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/517f75ef-9884-4c11-b440-e628047f8eba @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/5187deef-4e0b-4403-8713-19858d55f78a b/docstore/5187deef-4e0b-4403-8713-19858d55f78a new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/5187deef-4e0b-4403-8713-19858d55f78a @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
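A minimal sketch, assuming the thinking_budget field of ThinkingConfig in the Python SDK, of the "configure a thinking budget" option mentioned for 2.5 Flash; the budget value is an arbitrary illustration, not a recommendation.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Plan a three-step approach to refactoring a large codebase.",
    config=types.GenerateContentConfig(
        # thinking_budget caps the tokens spent on the internal thinking
        # process; 1024 here is an arbitrary value for this sketch.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)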
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/5195e3a3-3642-499e-b623-9f5e833108b0 b/docstore/5195e3a3-3642-499e-b623-9f5e833108b0 new file mode 100644 index 0000000000000000000000000000000000000000..9b857cf0b6fa55fe5dcad250bcdaa6c0fcc0f783 --- /dev/null +++ b/docstore/5195e3a3-3642-499e-b623-9f5e833108b0 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#main-content Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/519721df-b713-4b86-9aa3-2e8198a4b24f b/docstore/519721df-b713-4b86-9aa3-2e8198a4b24f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/519721df-b713-4b86-9aa3-2e8198a4b24f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/51ca10a3-6aea-497e-a7ed-22ab76b68591 b/docstore/51ca10a3-6aea-497e-a7ed-22ab76b68591 new file mode 100644 index 0000000000000000000000000000000000000000..cc60911e84f69efe9aed5a36f728569e3615d06d --- /dev/null +++ b/docstore/51ca10a3-6aea-497e-a7ed-22ab76b68591 @@ -0,0 +1 @@ +image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' \ No newline at end of file diff --git a/docstore/51eaa725-bcad-44a1-975f-56092ee690ba b/docstore/51eaa725-bcad-44a1-975f-56092ee690ba new file mode 100644 index 0000000000000000000000000000000000000000..3e0dca132b5cee05a4316835e4f2b62d82d3c7fe --- /dev/null +++ b/docstore/51eaa725-bcad-44a1-975f-56092ee690ba @@ -0,0 +1 @@ +Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . 
Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/521d1773-d407-4b53-ab03-8e68f193405a b/docstore/521d1773-d407-4b53-ab03-8e68f193405a new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/521d1773-d407-4b53-ab03-8e68f193405a @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/522c5f81-bd62-4725-9c61-149603354613 b/docstore/522c5f81-bd62-4725-9c61-149603354613 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/522c5f81-bd62-4725-9c61-149603354613 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. 
You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/524534e7-f5b1-4900-a1ec-ef0b98ec1fe7 b/docstore/524534e7-f5b1-4900-a1ec-ef0b98ec1fe7 new file mode 100644 index 0000000000000000000000000000000000000000..eb233fee8099f5f789dde3693dc446d13c990aff --- /dev/null +++ b/docstore/524534e7-f5b1-4900-a1ec-ef0b98ec1fe7 @@ -0,0 +1 @@ +UploadFile ( ... ) After (Centralized Client Object) Python from google import genai # Create a single client object client = genai . Client () # Access API methods through services on the client object response = client . models . generate_content ( ... ) chat = client . chats . create ( ... ) my_file = client . files . upload ( ... ) tuning_job = client . tunings . tune ( ... ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Create a single client object const ai = new GoogleGenAI ({ apiKey : "YOUR_API_KEY" }); // Access API methods through services on the client object const response = await ai . models . generateContent (...); const chat = ai . chats . create (...); const uploadedFile = await ai . files . upload (...); const cache = await ai . caches . create (...); Go import "google.golang.org/genai" // Create a single client object client , err := genai . NewClient ( ctx , nil ) // Access API methods through services on the client object result , err := client . Models . GenerateContent ( ... ) chat , err := client . Chats . Create ( ... ) uploadedFile , err := client . Files . Upload ( ... ) tuningJob , err := client . Tunings . Tune ( ... ) Authentication Both legacy and new libraries authenticate using API keys. You can create your API key in Google AI Studio. Before Python The old SDK handled the API client object implicitly. import google.generativeai as genai genai . configure ( api_key =... ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); Go Import the Google libraries: import ( "github.com/google/generative-ai-go/genai" "google.golang.org/api/option" ) Create the client: client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) After Python With Google GenAI SDK, you create an API client first, which is used to call the API. 
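A minimal sketch of that client-first pattern, assuming the key is supplied either through an environment variable or passed explicitly (the prompt text is illustrative):

from google import genai

# Picks up the API key from the environment; see the note that follows.
client = genai.Client()

# Alternatively, pass the key explicitly:
# client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Hello there",
)
print(response.text)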
The new SDK will pick up your API key from either one of the GEMINI_API_KEY or GOOGLE_API_KEY environment \ No newline at end of file diff --git a/docstore/524fb514-218c-4fe3-a954-fae8b135e0f7 b/docstore/524fb514-218c-4fe3-a954-fae8b135e0f7 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/524fb514-218c-4fe3-a954-fae8b135e0f7 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/52707423-4c6d-47ea-b4bc-728de634e3a0 b/docstore/52707423-4c6d-47ea-b4bc-728de634e3a0 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/52707423-4c6d-47ea-b4bc-728de634e3a0 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . 
upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/5271d5ae-e3bf-4d31-a564-e81cd116ca79 b/docstore/5271d5ae-e3bf-4d31-a564-e81cd116ca79 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/5271d5ae-e3bf-4d31-a564-e81cd116ca79 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/528684fe-0a7d-4186-8dc2-c0b987baa5b8 b/docstore/528684fe-0a7d-4186-8dc2-c0b987baa5b8 new file mode 100644 index 0000000000000000000000000000000000000000..a69c7da1fd958501534ea95997f7f7e649d525ae --- /dev/null +++ b/docstore/528684fe-0a7d-4186-8dc2-c0b987baa5b8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/embeddings#use-cases Title: Embeddings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/528d0c93-e60e-41e7-baea-995715a037a6 b/docstore/528d0c93-e60e-41e7-baea-995715a037a6 new file mode 100644 index 0000000000000000000000000000000000000000..117db86db9001e6bbb18de854c742e372952ac97 --- /dev/null +++ b/docstore/528d0c93-e60e-41e7-baea-995715a037a6 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/document-processing Title: Document understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/52c63f26-842f-4fa1-868a-eabbf2b9659e b/docstore/52c63f26-842f-4fa1-868a-eabbf2b9659e new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/52c63f26-842f-4fa1-868a-eabbf2b9659e @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. 
Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/52c795be-b6c8-4830-90eb-2016fc1330e1 b/docstore/52c795be-b6c8-4830-90eb-2016fc1330e1 new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/52c795be-b6c8-4830-90eb-2016fc1330e1 @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . 
join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/530c7eee-042e-4993-94ee-300f54cec699 b/docstore/530c7eee-042e-4993-94ee-300f54cec699 new file mode 100644 index 0000000000000000000000000000000000000000..2426e4316b986fa0eda84ec610a81c084b69e3a5 --- /dev/null +++ b/docstore/530c7eee-042e-4993-94ee-300f54cec699 @@ -0,0 +1 @@ += genai . embed_content ( model = 'models/text-embedding-004' , content = 'Hello world' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "text-embedding-004" , }); const result = await model . embedContent ( "Hello world!" ); console . log ( result . embedding ); After Python from google import genai client = genai . Client () response = client . models . embed_content ( model = 'text-embedding-004' , contents = 'Hello world' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const text = "Hello World!" ; const result = await ai . models . embedContent ({ model : "text-embedding-004" , contents : text , config : { outputDimensionality : 10 }, }); console . log ( result . embeddings ); Tune a Model Create and use a tuned model. The new SDK simplifies tuning with client.tunings.tune , which launches the tuning job and polls until the job is complete. Before Python import google.generativeai as genai import random # create tuning model train_data = {} for i in range ( 1 , 6 ): key = f 'input { i } ' value = f 'output { i } ' train_data [ key ] = value name = f 'generate-num- { random . randint ( 0 , 10000 ) } ' operation = genai . create_tuned_model ( source_model = 'models/gemini-1.5-flash-001-tuning' , training_data = train_data , id = name , epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , ) # wait for tuning complete tuningProgress = operation . result () # generate content with the tuned model model = genai . GenerativeModel ( model_name = f 'tunedModels/ { name } ' ) response = model . generate_content ( '55' ) After Python from google import genai from google.genai import types client = genai . Client () # Check which models are available for tuning. for m in client . models . list (): for action in m . supported_actions : if action == \ No newline at end of file diff --git a/docstore/53165ced-fe31-4113-899e-d9c7b2b397c7 b/docstore/53165ced-fe31-4113-899e-d9c7b2b397c7 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/53165ced-fe31-4113-899e-d9c7b2b397c7 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. 
You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/532ea422-4ad8-44ac-8b4c-d2082cfa55ee b/docstore/532ea422-4ad8-44ac-8b4c-d2082cfa55ee new file mode 100644 index 0000000000000000000000000000000000000000..46174b2df37cd8095bfdfeb5c7ac782d0d3c57c5 --- /dev/null +++ b/docstore/532ea422-4ad8-44ac-8b4c-d2082cfa55ee @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies#completion Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/533a1e62-5564-4e5f-9bb9-e2286c5720cd b/docstore/533a1e62-5564-4e5f-9bb9-e2286c5720cd new file mode 100644 index 0000000000000000000000000000000000000000..69559ef57aa2a80df61889983510f2d97741fc66 --- /dev/null +++ b/docstore/533a1e62-5564-4e5f-9bb9-e2286c5720cd @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-intro#prefixes Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/534b856b-6bbd-4938-81db-2e926b980fb5 b/docstore/534b856b-6bbd-4938-81db-2e926b980fb5 new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/534b856b-6bbd-4938-81db-2e926b980fb5 @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . 
read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/53728623-67b3-4181-9e01-04debccbd068 b/docstore/53728623-67b3-4181-9e01-04debccbd068 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/53728623-67b3-4181-9e01-04debccbd068 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/537c48f2-a1a7-4046-85dd-caa778d55c1f b/docstore/537c48f2-a1a7-4046-85dd-caa778d55c1f new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/537c48f2-a1a7-4046-85dd-caa778d55c1f @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/5389e5d6-d2f2-46cc-ad99-01dbee61cb1f b/docstore/5389e5d6-d2f2-46cc-ad99-01dbee61cb1f new file mode 100644 index 0000000000000000000000000000000000000000..ffa55cd17dc266b0e00c821779e2850dd473d215 --- /dev/null +++ b/docstore/5389e5d6-d2f2-46cc-ad99-01dbee61cb1f @@ -0,0 +1 @@ +"Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . 
inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/538c1847-bf78-4e0e-ae51-fab37b21889f b/docstore/538c1847-bf78-4e0e-ae51-fab37b21889f new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/538c1847-bf78-4e0e-ae51-fab37b21889f @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/538d02c9-7adb-4e6b-86a9-94c78d05cb34 b/docstore/538d02c9-7adb-4e6b-86a9-94c78d05cb34 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/538d02c9-7adb-4e6b-86a9-94c78d05cb34 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/538e3ebf-0a3b-4e8d-92f7-a26c7edd9387 b/docstore/538e3ebf-0a3b-4e8d-92f7-a26c7edd9387 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/538e3ebf-0a3b-4e8d-92f7-a26c7edd9387 @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. 
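To illustrate the sampling-parameter advice above, a minimal Python sketch with the google-genai SDK; the image path, prompt, and specific temperature and top-k values are illustrative assumptions.

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical local image; upload it first so it can be referenced in the prompt.
my_file = client.files.upload(file="path/to/sample.jpg")

# A lower temperature and a smaller top_k make the model less likely to
# extrapolate details that are not actually in the image.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[my_file, "Describe this image in two sentences."],
    config=types.GenerateContentConfig(temperature=0.2, top_k=32),
)
print(response.text)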
Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/53a4eaf8-f296-4244-9114-1e2e7ba777a5 b/docstore/53a4eaf8-f296-4244-9114-1e2e7ba777a5 new file mode 100644 index 0000000000000000000000000000000000000000..58c080b28fe0c0a0f77a553f5c6816f2c420fdd2 --- /dev/null +++ b/docstore/53a4eaf8-f296-4244-9114-1e2e7ba777a5 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "I have a math question for you:" }], }, { role : "model" , parts : [{ text : "Great! I'm ready for your math question. Please ask away." }], }, ], config : { tools : [{ codeExecution : {}}], } }); const response = await chat . sendMessage ({ message : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." }); console . log ( "Chat response:" , response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , config , nil , ) result , _ := chat . SendMessage ( ctx , genai . Part { Text : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and " + "make sure you get all 50." , }, ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . 
CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"tools": \ No newline at end of file diff --git a/docstore/53afeef5-57fb-4e26-bdb3-98678bdef96d b/docstore/53afeef5-57fb-4e26-bdb3-98678bdef96d new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/53afeef5-57fb-4e26-bdb3-98678bdef96d @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/53b6ec78-eaaa-4061-8b08-027566d84f0a b/docstore/53b6ec78-eaaa-4061-8b08-027566d84f0a new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/53b6ec78-eaaa-4061-8b08-027566d84f0a @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/53b7194e-6f9b-4b36-a9a8-ddfe21807249 b/docstore/53b7194e-6f9b-4b36-a9a8-ddfe21807249 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/53b7194e-6f9b-4b36-a9a8-ddfe21807249 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . 
stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/53c5d1ed-b826-48d6-98d8-0623ea82626d b/docstore/53c5d1ed-b826-48d6-98d8-0623ea82626d new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/53c5d1ed-b826-48d6-98d8-0623ea82626d @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/53f8041e-3cbe-4ce6-acc3-f62289c3a56c b/docstore/53f8041e-3cbe-4ce6-acc3-f62289c3a56c new file mode 100644 index 0000000000000000000000000000000000000000..f9adff9b67eac5b5b36a6074a1ab529fecc1cb9f --- /dev/null +++ b/docstore/53f8041e-3cbe-4ce6-acc3-f62289c3a56c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/downloads Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5429482c-8e2b-4a89-99de-85c343bf0394 b/docstore/5429482c-8e2b-4a89-99de-85c343bf0394 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/5429482c-8e2b-4a89-99de-85c343bf0394 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/54526a21-7b90-4277-9253-066651d5e89c b/docstore/54526a21-7b90-4277-9253-066651d5e89c new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/54526a21-7b90-4277-9253-066651d5e89c @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/545ca48e-2eb0-4ba7-bb04-e336bb806dae b/docstore/545ca48e-2eb0-4ba7-bb04-e336bb806dae new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/545ca48e-2eb0-4ba7-bb04-e336bb806dae @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/54736aa3-ea25-45b3-94b5-5c33de1a64df b/docstore/54736aa3-ea25-45b3-94b5-5c33de1a64df new file mode 100644 index 0000000000000000000000000000000000000000..8b9d2b393068f4715ffb76c5e56984474c9f1ba2 --- /dev/null +++ b/docstore/54736aa3-ea25-45b3-94b5-5c33de1a64df @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-flash-8b Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/547dc67c-72a3-4ee3-9e9c-383df7cb5614 b/docstore/547dc67c-72a3-4ee3-9e9c-383df7cb5614 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/547dc67c-72a3-4ee3-9e9c-383df7cb5614 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . 
parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/54a10176-f16a-4b9e-9104-ee9b330fbb14 b/docstore/54a10176-f16a-4b9e-9104-ee9b330fbb14 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/54a10176-f16a-4b9e-9104-ee9b330fbb14 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/54bca9c0-e9b5-44dc-9504-2cb40c550d83 b/docstore/54bca9c0-e9b5-44dc-9504-2cb40c550d83 new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/54bca9c0-e9b5-44dc-9504-2cb40c550d83 @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . \ No newline at end of file diff --git a/docstore/54c77c31-89e8-45d5-9540-7714ea8b7f9a b/docstore/54c77c31-89e8-45d5-9540-7714ea8b7f9a new file mode 100644 index 0000000000000000000000000000000000000000..de98b41ec31106077167d65dc0d83dfd4822d872 --- /dev/null +++ b/docstore/54c77c31-89e8-45d5-9540-7714ea8b7f9a @@ -0,0 +1 @@ +moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. 
] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the \ No newline at end of file diff --git a/docstore/54da9e69-2445-4e3f-8adf-04c1a88f0f0d b/docstore/54da9e69-2445-4e3f-8adf-04c1a88f0f0d new file mode 100644 index 0000000000000000000000000000000000000000..d86b97ebb931c5d5bb58bc99d13817b9beccf2a0 --- /dev/null +++ b/docstore/54da9e69-2445-4e3f-8adf-04c1a88f0f0d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#function_calling_modes Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/54ddb1de-acc6-4a95-898f-ef85d940edb0 b/docstore/54ddb1de-acc6-4a95-898f-ef85d940edb0 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/54ddb1de-acc6-4a95-898f-ef85d940edb0 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/54e14dc0-7957-4372-a697-2113de5615e7 b/docstore/54e14dc0-7957-4372-a697-2113de5615e7 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/54e14dc0-7957-4372-a697-2113de5615e7 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/55140b8d-4612-497f-b0e6-5a349b663d47 b/docstore/55140b8d-4612-497f-b0e6-5a349b663d47 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/55140b8d-4612-497f-b0e6-5a349b663d47 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/5515c074-ff01-47eb-99da-4f1bf7fcb32e b/docstore/5515c074-ff01-47eb-99da-4f1bf7fcb32e new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/5515c074-ff01-47eb-99da-4f1bf7fcb32e @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. 
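To make the declaration schema described above concrete, here is a minimal, hedged sketch in the style of the Python examples elsewhere on this page. The set_light_color function, its enum values, and the field descriptions are illustrative only; just the declaration structure (name, description, parameters, properties, enum, required) follows the schema described above.

Python

from google import genai
from google.genai import types

# Illustrative declaration following the schema described above.
set_light_color_declaration = {
    "name": "set_light_color",
    "description": "Sets the color temperature of a smart light.",
    "parameters": {
        "type": "object",
        "properties": {
            "color_temperature": {
                "type": "string",
                "description": "Color temperature preset for the light.",
                "enum": ["daylight", "cool", "warm"],
            },
        },
        "required": ["color_temperature"],
    },
}

client = genai.Client()
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[set_light_color_declaration])]
)
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Make the lighting feel warm and cozy.",
    config=config,
)
# Each entry is a function call the model wants you to execute in your own code.
for fn in response.function_calls or []:
    print(fn.name, fn.args)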
To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/552d44b6-3b89-4c3b-83c1-1fc755d3bdcb b/docstore/552d44b6-3b89-4c3b-83c1-1fc755d3bdcb new file mode 100644 index 0000000000000000000000000000000000000000..1983a1b7b4b0634f95c028654d1fae0a75b50e6a --- /dev/null +++ b/docstore/552d44b6-3b89-4c3b-83c1-1fc755d3bdcb @@ -0,0 +1 @@ +{ mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. 
Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The \ No newline at end of file diff --git a/docstore/55346b20-030f-4cb8-8316-124f48482b3e b/docstore/55346b20-030f-4cb8-8316-124f48482b3e new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/55346b20-030f-4cb8-8316-124f48482b3e @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. 
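As a concrete illustration of experimenting with parameters, the following hedged sketch sets a few common generation parameters through GenerateContentConfig, in the style of the Python examples on this page; the specific values are arbitrary starting points, not recommendations.

Python

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a two-sentence summary of photosynthesis.",
    config=types.GenerateContentConfig(
        max_output_tokens=200,  # cap the length of the response
        temperature=0.4,        # lower values produce more deterministic output
        top_p=0.95,             # nucleus sampling cutoff
    ),
)
print(response.text)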
The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/5535ed60-265a-4f12-94e4-3c071f060e5d b/docstore/5535ed60-265a-4f12-94e4-3c071f060e5d new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/5535ed60-265a-4f12-94e4-3c071f060e5d @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/5564a308-010c-4678-a84c-e4696b436760 b/docstore/5564a308-010c-4678-a84c-e4696b436760 new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/5564a308-010c-4678-a84c-e4696b436760 @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. 
" } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/558c10a3-bdfa-4591-b503-7d01c7806687 b/docstore/558c10a3-bdfa-4591-b503-7d01c7806687 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/558c10a3-bdfa-4591-b503-7d01c7806687 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/559091d7-d2c8-43af-8b91-4076d1113b2c b/docstore/559091d7-d2c8-43af-8b91-4076d1113b2c new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/559091d7-d2c8-43af-8b91-4076d1113b2c @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/55a4b683-c474-4986-a2a6-9a6675cc4e60 b/docstore/55a4b683-c474-4986-a2a6-9a6675cc4e60 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/55a4b683-c474-4986-a2a6-9a6675cc4e60 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/55ad1cd6-7d9b-4c43-8233-7b50e0487522 b/docstore/55ad1cd6-7d9b-4c43-8233-7b50e0487522 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/55ad1cd6-7d9b-4c43-8233-7b50e0487522 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/55c696f5-e0bc-4120-9b9b-9be5c7975e93 b/docstore/55c696f5-e0bc-4120-9b9b-9be5c7975e93 new file mode 100644 index 0000000000000000000000000000000000000000..e01fb45cbed3453bf1ca90f079ac96b78e0dc7ff --- /dev/null +++ b/docstore/55c696f5-e0bc-4120-9b9b-9be5c7975e93 @@ -0,0 +1 @@ +Asynchronous function calling is only supported in half-cascade audio generation. 
Function calling executes sequentially by default, meaning execution pauses until the results of each function call are available. This ensures sequential processing, which means you won't be able to continue interacting with the model while the functions are being run. If you don't want to block the conversation, you can tell the model to run the functions asynchronously. To do so, you first need to add a behavior to the function definitions: Python # Non-blocking function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" , "behavior" : "NON_BLOCKING" } # turn_on_the_lights will run asynchronously turn_off_the_lights = { "name" : "turn_off_the_lights" } # turn_off_the_lights will still pause all interactions with the model JavaScript import { GoogleGenAI , Modality , Behavior } from '@google/genai' ; // Non-blocking function definitions const turn_on_the_lights = { name : "turn_on_the_lights" , behavior : Behavior . NON_BLOCKING } // Blocking function definitions const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] NON-BLOCKING ensures the function runs asynchronously while you can continue interacting with the model. Then you need to tell the model how to behave when it receives the FunctionResponse using the scheduling parameter. It can either: Interrupt what it's doing and tell you about the response it got right away ( scheduling="INTERRUPT" ), Wait until it's finished with what it's currently doing ( scheduling="WHEN_IDLE" ), Or do nothing and use that knowledge later on in the discussion ( scheduling="SILENT" ) Python # for a non-blocking function definition, apply scheduling in the function response: function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" , "scheduling" : "INTERRUPT" # Can also be WHEN_IDLE or \ No newline at end of file diff --git a/docstore/55c6ec47-d3d9-492a-8d9e-e0cdf5eff735 b/docstore/55c6ec47-d3d9-492a-8d9e-e0cdf5eff735 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/55c6ec47-d3d9-492a-8d9e-e0cdf5eff735 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
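Tying together the non-blocking function definitions and the scheduling options described earlier on this page, the following is a hedged sketch of a Live API session that returns a FunctionResponse carrying a scheduling value. The model name, the connect, send_client_content, receive, and send_tool_response calls, and the prompt are assumed from the Live API examples on this site rather than guaranteed signatures; treat it as a sketch, not a reference implementation.

Python

import asyncio
from google import genai
from google.genai import types

client = genai.Client()

# Non-blocking declaration, as in the example above.
turn_on_the_lights = {"name": "turn_on_the_lights", "behavior": "NON_BLOCKING"}

config = {
    "response_modalities": ["TEXT"],
    "tools": [{"function_declarations": [turn_on_the_lights]}],
}

async def main():
    # Model name and Live API methods assumed from the Live API docs.
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview", config=config
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Turn on the lights please"}]},
            turn_complete=True,
        )
        async for chunk in session.receive():
            if chunk.tool_call:
                responses = [
                    types.FunctionResponse(
                        id=fc.id,
                        name=fc.name,
                        # WHEN_IDLE defers the model's reaction; INTERRUPT and
                        # SILENT are the other scheduling options described above.
                        response={"result": "ok", "scheduling": "WHEN_IDLE"},
                    )
                    for fc in chunk.tool_call.function_calls
                ]
                await session.send_tool_response(function_responses=responses)
            elif chunk.text:
                print(chunk.text, end="")

asyncio.run(main())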
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/55cd9d14-7a98-4ef7-9cca-777a9ab71e3b b/docstore/55cd9d14-7a98-4ef7-9cca-777a9ab71e3b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/55cd9d14-7a98-4ef7-9cca-777a9ab71e3b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/55e4196b-9116-461b-8061-1995a1a0e9e9 b/docstore/55e4196b-9116-461b-8061-1995a1a0e9e9 new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ 
b/docstore/55e4196b-9116-461b-8061-1995a1a0e9e9 @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/55e451fd-50a9-415b-ae37-803231ee1964 b/docstore/55e451fd-50a9-415b-ae37-803231ee1964 new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/55e451fd-50a9-415b-ae37-803231ee1964 @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. 
Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/55e66825-5b74-40c5-bf97-6c2e17635dc9 b/docstore/55e66825-5b74-40c5-bf97-6c2e17635dc9 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/55e66825-5b74-40c5-bf97-6c2e17635dc9 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . 
import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/560edaaf-2ad0-4171-b156-1fb7a10d580a b/docstore/560edaaf-2ad0-4171-b156-1fb7a10d580a new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/560edaaf-2ad0-4171-b156-1fb7a10d580a @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/561082be-8ccc-48b4-9414-adae7353493e b/docstore/561082be-8ccc-48b4-9414-adae7353493e new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/561082be-8ccc-48b4-9414-adae7353493e @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/562e18f5-6bbc-4a71-89df-fdb74832a777 b/docstore/562e18f5-6bbc-4a71-89df-fdb74832a777 new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/562e18f5-6bbc-4a71-89df-fdb74832a777 @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. 
Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/562f4aa1-f647-402b-96fa-76c49e69362c b/docstore/562f4aa1-f647-402b-96fa-76c49e69362c new file mode 100644 index 0000000000000000000000000000000000000000..1f0c3b246331ddbf92f38a6b0613acd26fe193f7 --- /dev/null +++ b/docstore/562f4aa1-f647-402b-96fa-76c49e69362c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/structured-output#generating-json Title: Structured output | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5648e349-af3b-4886-b5fd-f5c0711170ea b/docstore/5648e349-af3b-4886-b5fd-f5c0711170ea new file mode 100644 index 0000000000000000000000000000000000000000..ad265efbf625954dc0d0ac1c21512df38213077e --- /dev/null +++ b/docstore/5648e349-af3b-4886-b5fd-f5c0711170ea @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/url-context#main-content Title: URL context | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/565ec6d4-68eb-4a58-b740-9449f23070f6 b/docstore/565ec6d4-68eb-4a58-b740-9449f23070f6 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/565ec6d4-68eb-4a58-b740-9449f23070f6 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. 
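The Imagen prompts shown in the keyword tables above are submitted like any other generation request. Below is a hedged sketch using the generate_images method and the imagen-3.0-generate-002 model code from this page; the exact response fields are assumed from the google-genai Python SDK, and the output filename is illustrative.

Python

from io import BytesIO
from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()
response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A woman, 35mm portrait, blue and grey duotones",
    config=types.GenerateImagesConfig(number_of_images=1),
)
# Assumed response shape: a list of generated images carrying raw image bytes.
for i, generated_image in enumerate(response.generated_images):
    image = Image.open(BytesIO(generated_image.image.image_bytes))
    image.save(f"portrait_{i}.png")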
An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/5676c4d5-9a57-4f14-b094-73f96f8a250c b/docstore/5676c4d5-9a57-4f14-b094-73f96f8a250c new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/5676c4d5-9a57-4f14-b094-73f96f8a250c @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . 
Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/56861f64-b4fa-44d3-835e-fb65fd901c3a b/docstore/56861f64-b4fa-44d3-835e-fb65fd901c3a new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/56861f64-b4fa-44d3-835e-fb65fd901c3a @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/56a2869f-86de-4be7-81c8-ab89d4e61bcd b/docstore/56a2869f-86de-4be7-81c8-ab89d4e61bcd new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/56a2869f-86de-4be7-81c8-ab89d4e61bcd @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. 
They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/56d6fc38-5223-4976-8c1c-1c60d5378d9b b/docstore/56d6fc38-5223-4976-8c1c-1c60d5378d9b new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/56d6fc38-5223-4976-8c1c-1c60d5378d9b @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. 
Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/56d85a7f-83f1-4bb3-a515-2d333d82b51a b/docstore/56d85a7f-83f1-4bb3-a515-2d333d82b51a new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/56d85a7f-83f1-4bb3-a515-2d333d82b51a @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/56df3079-2471-4b82-bb54-a74cf4e8a4ef b/docstore/56df3079-2471-4b82-bb54-a74cf4e8a4ef new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/56df3079-2471-4b82-bb54-a74cf4e8a4ef @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/56eef8c6-335f-4d0b-b889-c709c94bf002 b/docstore/56eef8c6-335f-4d0b-b889-c709c94bf002 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/56eef8c6-335f-4d0b-b889-c709c94bf002 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . 
append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/56f0cf8f-9e96-44ec-b9aa-843df4bbb072 b/docstore/56f0cf8f-9e96-44ec-b9aa-843df4bbb072 new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/56f0cf8f-9e96-44ec-b9aa-843df4bbb072 @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. "generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. 
Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/56f2ac3c-4b32-493f-bddc-b25135201abe b/docstore/56f2ac3c-4b32-493f-bddc-b25135201abe new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/56f2ac3c-4b32-493f-bddc-b25135201abe @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/5713533d-e87e-4373-919a-19dcc8357206 b/docstore/5713533d-e87e-4373-919a-19dcc8357206 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/5713533d-e87e-4373-919a-19dcc8357206 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/5740b4d5-8f3c-4ec5-9a04-0f8d6657dafe b/docstore/5740b4d5-8f3c-4ec5-9a04-0f8d6657dafe new file mode 100644 index 0000000000000000000000000000000000000000..05f586b9ee4ba7b248a7cf2844965480ce1e46ee --- /dev/null +++ b/docstore/5740b4d5-8f3c-4ec5-9a04-0f8d6657dafe @@ -0,0 +1 @@ +Code execution | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Code execution The Gemini API provides a code execution tool that enables the model to generate and run Python code. The model can then learn iteratively from the code execution results until it arrives at a final output. You can use code execution to build applications that benefit from code-based reasoning. For example, you can use code execution to solve equations or process text. You can also use the libraries included in the code execution environment to perform more specialized tasks. Gemini is only able to execute code in Python. You can still ask Gemini to generate code in another language, but the model can't use the code execution tool to run it. Enable code execution To enable code execution, configure the code execution tool on the model. This allows the model to generate and run code. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What is the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." 
, config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new \ No newline at end of file diff --git a/docstore/5755e6ee-ae7c-4555-acf1-23466e91447d b/docstore/5755e6ee-ae7c-4555-acf1-23466e91447d new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/5755e6ee-ae7c-4555-acf1-23466e91447d @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/575e3e50-ebc1-468e-ae6a-497388569a25 b/docstore/575e3e50-ebc1-468e-ae6a-497388569a25 new file mode 100644 index 0000000000000000000000000000000000000000..83ae5cdb2af6e799ba55894dddc7bdf98a552a64 --- /dev/null +++ b/docstore/575e3e50-ebc1-468e-ae6a-497388569a25 @@ -0,0 +1 @@ +Gabon The Gambia Georgia Germany Ghana Gibraltar Greece Greenland Grenada Guam Guatemala Guernsey Guinea Guinea-Bissau Guyana Haiti Heard Island and McDonald Islands Herzegovina Honduras Hungary Iceland India Indonesia Iraq Ireland Isle of Man Israel Italy Jamaica Japan Jersey Jordan Kazakhstan Kenya Kiribati Kosovo Kyrgyzstan Kuwait Laos Latvia Lebanon Lesotho Liberia Libya Liechtenstein Lithuania Luxembourg Madagascar Malawi Malaysia Maldives Mali Malta Marshall Islands Mauritania Mauritius Mexico Micronesia Mongolia Montenegro Montserrat Morocco Mozambique Namibia Nauru Nepal Netherlands New Caledonia New Zealand Nicaragua Niger Nigeria Niue Norfolk Island North Macedonia Northern Mariana Islands Norway Oman Pakistan Palau Palestine Panama Papua New Guinea Paraguay Peru Philippines Pitcairn Islands Poland Portugal Puerto Rico Qatar Republic of Cyprus Republic of the Congo Romania Rwanda Saint Barthélemy Saint Kitts and Nevis Saint Lucia Saint Pierre and Miquelon Saint Vincent and the Grenadines Saint Helena, Ascension and Tristan da Cunha Samoa São Tomé and Príncipe Saudi Arabia Senegal Serbia Seychelles Sierra Leone Singapore Slovakia Slovenia Solomon Islands Somalia South Africa South Georgia and the South Sandwich Islands South Korea South Sudan Spain Sri Lanka Sudan Suriname Sweden Switzerland Taiwan Tajikistan Tanzania Thailand Timor-Leste Togo Tokelau Tonga Trinidad and Tobago Tunisia Türkiye Turkmenistan Turks and Caicos Islands Tuvalu Uganda Ukraine United Kingdom United Arab Emirates United States United States Minor Outlying Islands U.S. Virgin Islands Uruguay Uzbekistan Vanuatu Venezuela Vietnam Wallis and Futuna Western Sahara Yemen Zambia Zimbabwe Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its \ No newline at end of file diff --git a/docstore/57600251-76d3-4b90-a8bf-e08401e5eadc b/docstore/57600251-76d3-4b90-a8bf-e08401e5eadc new file mode 100644 index 0000000000000000000000000000000000000000..b8d7dab8b59ea83c8480687d32380faf07bab32f --- /dev/null +++ b/docstore/57600251-76d3-4b90-a8bf-e08401e5eadc @@ -0,0 +1 @@ +google.generativeai as genai # Directly create and use model objects model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( ... ) chat = model . start_chat ( ... ) JavaScript While GoogleGenerativeAI was a central point for models and chat, other functionalities like file and cache management often required importing and instantiating entirely separate client classes. import { GoogleGenerativeAI } from "@google/generative-ai" ; import { GoogleAIFileManager , GoogleAICacheManager } from "@google/generative-ai/server" ; // For files/caching const genAI = new GoogleGenerativeAI ( "YOUR_API_KEY" ); const fileManager = new GoogleAIFileManager ( "YOUR_API_KEY" ); const cacheManager = new GoogleAICacheManager ( "YOUR_API_KEY" ); // Get a model instance, then call methods on it const model = genAI . 
getGenerativeModel ({ model : "gemini-1.5-flash" }); const result = await model . generateContent (...); const chat = model . startChat (...); // Call methods on separate client objects for other services const uploadedFile = await fileManager . uploadFile (...); const cache = await cacheManager . create (...); Go The genai.NewClient function created a client, but generative model operations were typically called on a separate GenerativeModel instance obtained from this client. Other services might have been accessed via distinct packages or patterns. import ( "github.com/google/generative-ai-go/genai" "github.com/google/generative-ai-go/genai/fileman" // For files "google.golang.org/api/option" ) client , err := genai . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) fileClient , err := fileman . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) // Get a model instance, then call methods on it model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ... ) cs := model . StartChat () // Call methods on separate client objects for other services uploadedFile , err := fileClient . \ No newline at end of file diff --git a/docstore/576a244a-6121-4414-965c-8408bcde4c7f b/docstore/576a244a-6121-4414-965c-8408bcde4c7f new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/576a244a-6121-4414-965c-8408bcde4c7f @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/577b60d5-ed69-4e93-8de9-c5ef629ab88d b/docstore/577b60d5-ed69-4e93-8de9-c5ef629ab88d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/577b60d5-ed69-4e93-8de9-c5ef629ab88d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/57873b90-cb4a-4d8b-b191-03ded3322cb3 b/docstore/57873b90-cb4a-4d8b-b191-03ded3322cb3 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/57873b90-cb4a-4d8b-b191-03ded3322cb3 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/57a52098-d1bf-4f50-a2c8-9a75b5e28b1a b/docstore/57a52098-d1bf-4f50-a2c8-9a75b5e28b1a new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/57a52098-d1bf-4f50-a2c8-9a75b5e28b1a @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/57a90b7b-53d2-4b72-b973-ea564f8e658f b/docstore/57a90b7b-53d2-4b72-b973-ea564f8e658f new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/57a90b7b-53d2-4b72-b973-ea564f8e658f @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/57c0e4f1-d5e9-4c03-a73b-f2added97e37 b/docstore/57c0e4f1-d5e9-4c03-a73b-f2added97e37 new file mode 100644 index 0000000000000000000000000000000000000000..276b272d06bde6f464a4287876b38d8d2bc17eb7 --- /dev/null +++ b/docstore/57c0e4f1-d5e9-4c03-a73b-f2added97e37 @@ -0,0 +1 @@ +Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Migrate to the Google GenAI SDK Starting with the Gemini 2.0 release in late 2024, we introduced a new set of libraries called the Google GenAI SDK . It offers an improved developer experience through an updated client architecture , and simplifies the transition between developer and enterprise workflows. The Google GenAI SDK is now in General Availability (GA) across all supported platforms. If you're using one of our legacy libraries , we strongly recommend you to migrate. This guide provides before-and-after examples of migrated code to help you get started. Note: The Go examples omit imports and other boilerplate code to improve readability. Installation Before Python pip install -U -q "google-generativeai" JavaScript npm install @google/generative-ai Go go get github.com/google/generative-ai-go After Python pip install -U -q "google-genai" JavaScript npm install @google/genai Go go get google.golang.org/genai API access The old SDK implicitly handled the API client behind the scenes using a variety of ad hoc methods. This made it hard to manage the client and credentials. Now, you interact through a central Client object. This Client object acts as a single entry point for various API services (e.g., models , chats , files , tunings ), promoting consistency and simplifying credential and configuration management across different API calls. Before (Less Centralized API Access) Python The old SDK didn't explicitly use a top-level client object for most API calls. You would directly instantiate and interact with GenerativeModel objects. import \ No newline at end of file diff --git a/docstore/57f4d7b3-d0f9-465d-ba2b-e479712db56f b/docstore/57f4d7b3-d0f9-465d-ba2b-e479712db56f new file mode 100644 index 0000000000000000000000000000000000000000..00c5cdc92c5adc1f652744cb7f71ce82eb12e3dc --- /dev/null +++ b/docstore/57f4d7b3-d0f9-465d-ba2b-e479712db56f @@ -0,0 +1 @@ +( 'https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf' ) . then (( response ) = > response . arrayBuffer ()); const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( pdfResp ). toString ( "base64" ) } } ]; const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfResp , _ := http . Get ( "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" ) var pdfBytes [] byte if pdfResp != nil && pdfResp . Body != nil { pdfBytes , _ = io . ReadAll ( pdfResp . Body ) pdfResp . Body . Close () } parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" PROMPT = "Summarize this document" DISPLAY_NAME = "base64_pdf" # Download the PDF wget -O " ${ DISPLAY_NAME } .pdf" " ${ DOC_URL } " # Check for FreeBSD base64 and set flags accordingly if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi # Base64 encode the PDF ENCODED_PDF = $( base64 $B64FLAGS " ${ DISPLAY_NAME } .pdf" ) # Generate content using the base64 encoded PDF curl \ No newline at end of file diff --git a/docstore/5808c1e4-c4d2-40d0-82ca-b0430813cc4e b/docstore/5808c1e4-c4d2-40d0-82ca-b0430813cc4e new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/5808c1e4-c4d2-40d0-82ca-b0430813cc4e @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. 
Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/580ede41-e0be-4cf8-9000-8cc81d5d614f b/docstore/580ede41-e0be-4cf8-9000-8cc81d5d614f new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/580ede41-e0be-4cf8-9000-8cc81d5d614f @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . 
debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/5827ad52-aa54-42be-8d2a-3c0b47da7adc b/docstore/5827ad52-aa54-42be-8d2a-3c0b47da7adc new file mode 100644 index 0000000000000000000000000000000000000000..d4fb21082ac4c3c069d303936667f413973ff128 --- /dev/null +++ b/docstore/5827ad52-aa54-42be-8d2a-3c0b47da7adc @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#voices Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/582b107d-3eb2-4e5a-a971-22d9ae4e5346 b/docstore/582b107d-3eb2-4e5a-a971-22d9ae4e5346 new file mode 100644 index 0000000000000000000000000000000000000000..b9aae1f02a8caa7a25135d3bec800921c05dfc11 --- /dev/null +++ b/docstore/582b107d-3eb2-4e5a-a971-22d9ae4e5346 @@ -0,0 +1 @@ +( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on \ No newline at end of file diff --git a/docstore/582f6018-0266-4d88-a3cc-3999117f9ed6 b/docstore/582f6018-0266-4d88-a3cc-3999117f9ed6 new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/582f6018-0266-4d88-a3cc-3999117f9ed6 @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . 
PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/584abdac-0cd5-4a42-b2b4-a84332cbdd7f b/docstore/584abdac-0cd5-4a42-b2b4-a84332cbdd7f new file mode 100644 index 0000000000000000000000000000000000000000..76b3241c42effba70a7ac847bf09514ad7de11cd --- /dev/null +++ b/docstore/584abdac-0cd5-4a42-b2b4-a84332cbdd7f @@ -0,0 +1 @@ +not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. 
The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def \ No newline at end of file diff --git a/docstore/5857df72-7764-44dd-9b7f-6b93174bca34 b/docstore/5857df72-7764-44dd-9b7f-6b93174bca34 new file mode 100644 index 0000000000000000000000000000000000000000..cc60911e84f69efe9aed5a36f728569e3615d06d --- /dev/null +++ b/docstore/5857df72-7764-44dd-9b7f-6b93174bca34 @@ -0,0 +1 @@ +image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' \ No newline at end of file diff --git a/docstore/58733530-36ff-408b-b9f0-36e38f869054 b/docstore/58733530-36ff-408b-b9f0-36e38f869054 new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/58733530-36ff-408b-b9f0-36e38f869054 @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . 
Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/5879acc7-9f57-428f-91d0-4381f8c42e98 b/docstore/5879acc7-9f57-428f-91d0-4381f8c42e98 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/5879acc7-9f57-428f-91d0-4381f8c42e98 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . 
Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/58903fb8-780d-4bde-ada8-56faba020b91 b/docstore/58903fb8-780d-4bde-ada8-56faba020b91 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/58903fb8-780d-4bde-ada8-56faba020b91 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/58a049fa-bfcd-4f60-8f0b-d96dee64a618 b/docstore/58a049fa-bfcd-4f60-8f0b-d96dee64a618 new file mode 100644 index 0000000000000000000000000000000000000000..46d53c21bdcab42ceff886736b536f59f9b05598 --- /dev/null +++ b/docstore/58a049fa-bfcd-4f60-8f0b-d96dee64a618 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries#new-libraries Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/58a4fa0e-dbd2-4a78-bbad-58cbc4c667ea b/docstore/58a4fa0e-dbd2-4a78-bbad-58cbc4c667ea new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/58a4fa0e-dbd2-4a78-bbad-58cbc4c667ea @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/58a9de69-21f2-4ac6-95b4-0c2acba5c735 b/docstore/58a9de69-21f2-4ac6-95b4-0c2acba5c735 new file mode 100644 index 0000000000000000000000000000000000000000..b9a0fd4b25842077eb6b6b7137ead6b05cace053 --- /dev/null +++ b/docstore/58a9de69-21f2-4ac6-95b4-0c2acba5c735 @@ -0,0 +1 @@ +record which response you preferred. This is the core of the feedback we're collecting. Usage Details: This includes information about which model generated the response and other technical and operational details about your usage of this feature. 
Your Privacy We take your privacy seriously. Google takes steps to protect your privacy as part of this process. This includes disconnecting this data from your Google Account, API key, and Cloud project before reviewers see or annotate it. Do not submit feedback on conversations that include sensitive, confidential, or personal information. Opting Out You'll have the option to skip the Inline Preference Voting when it appears. Thank you for helping us improve Google AI Studio! Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-03-24 UTC. \ No newline at end of file diff --git a/docstore/58b6eafa-e849-4b13-bc58-fa7d4fa3a4ba b/docstore/58b6eafa-e849-4b13-bc58-fa7d4fa3a4ba new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/58b6eafa-e849-4b13-bc58-fa7d4fa3a4ba @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/58d93eb3-e6b0-4118-b286-e659d559b6d5 b/docstore/58d93eb3-e6b0-4118-b286-e659d559b6d5 new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/58d93eb3-e6b0-4118-b286-e659d559b6d5 @@ -0,0 +1 @@ +This example shows you how to specify a subject description. 
Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example \ No newline at end of file diff --git a/docstore/58ddd249-8419-4509-a2aa-ba11a88df8de b/docstore/58ddd249-8419-4509-a2aa-ba11a88df8de new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/58ddd249-8419-4509-a2aa-ba11a88df8de @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/58fbddcb-2410-44a6-91a5-f67b73b6ef67 b/docstore/58fbddcb-2410-44a6-91a5-f67b73b6ef67 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/58fbddcb-2410-44a6-91a5-f67b73b6ef67 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/58fe476a-3e32-4843-ad4f-8d7a49dde74d b/docstore/58fe476a-3e32-4843-ad4f-8d7a49dde74d new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/58fe476a-3e32-4843-ad4f-8d7a49dde74d @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . 
sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/59042dc2-1160-4dcb-9a76-1d5cdf589864 b/docstore/59042dc2-1160-4dcb-9a76-1d5cdf589864 new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/59042dc2-1160-4dcb-9a76-1d5cdf589864 @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/59866869-0eb3-469d-ad7b-3862cd987a92 b/docstore/59866869-0eb3-469d-ad7b-3862cd987a92 new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/59866869-0eb3-469d-ad7b-3862cd987a92 @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . 
save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/59b03053-0404-4ad8-90da-f92c699ea953 b/docstore/59b03053-0404-4ad8-90da-f92c699ea953 new file mode 100644 index 0000000000000000000000000000000000000000..aa09fa8779a782eb0f4519da995c2b766869468f --- /dev/null +++ b/docstore/59b03053-0404-4ad8-90da-f92c699ea953 @@ -0,0 +1 @@ +genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Refer to timestamps You can refer to specific sections of an audio file using timestamps of the form MM:SS . For example, the following prompt requests a transcript that Starts at 2 minutes 30 seconds from the beginning of the file. Ends at 3 minutes 29 seconds from the beginning of the file. Python # Create a prompt containing timestamps. prompt = "Provide a transcript of the speech from 02:30 to 03:29." JavaScript // Create a prompt containing timestamps. const prompt = "Provide a transcript of the speech from 02:30 to 03:29." Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Provide a transcript of the speech " + "between the timestamps 02:30 and 03:29." ), genai . NewPartFromURI ( uploadedFile . 
URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Count tokens Call the countTokens method to get a count of the number of tokens in an audio file. For example: Python response = client . models . count_tokens ( model = 'gemini-2.5-flash' , contents = [ myfile ] ) print ( response ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai \ No newline at end of file diff --git a/docstore/59b5afdf-636e-4e66-836f-44ea2f44e58c b/docstore/59b5afdf-636e-4e66-836f-44ea2f44e58c new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/59b5afdf-636e-4e66-836f-44ea2f44e58c @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/59d4554a-6a12-4a20-89c1-526d6e9952e3 b/docstore/59d4554a-6a12-4a20-89c1-526d6e9952e3 new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/59d4554a-6a12-4a20-89c1-526d6e9952e3 @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . 
buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/59de2341-1cc5-48e9-acf1-9ab7f343a783 b/docstore/59de2341-1cc5-48e9-acf1-9ab7f343a783 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/59de2341-1cc5-48e9-acf1-9ab7f343a783 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." 
, }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/59ed9ca4-3fc6-4355-a66c-3fed53c1b0cc b/docstore/59ed9ca4-3fc6-4355-a66c-3fed53c1b0cc new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/59ed9ca4-3fc6-4355-a66c-3fed53c1b0cc @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/59f53e1d-3a7e-4baa-aac2-e0e28409970f b/docstore/59f53e1d-3a7e-4baa-aac2-e0e28409970f new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/59f53e1d-3a7e-4baa-aac2-e0e28409970f @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . 
debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/5a1ea974-ce35-4de3-83b9-42f4a8cde148 b/docstore/5a1ea974-ce35-4de3-83b9-42f4a8cde148 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/5a1ea974-ce35-4de3-83b9-42f4a8cde148 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/5a2fe474-d59e-4184-b8a7-0e0c53689002 b/docstore/5a2fe474-d59e-4184-b8a7-0e0c53689002 new file mode 100644 index 0000000000000000000000000000000000000000..1792d386361c24450916b97ea69fff7069eaf586 --- /dev/null +++ b/docstore/5a2fe474-d59e-4184-b8a7-0e0c53689002 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/api-key#main-content Title: Using Gemini API keys | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5a3468d2-4e69-421c-92e4-4596d739eb34 b/docstore/5a3468d2-4e69-421c-92e4-4596d739eb34 new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/5a3468d2-4e69-421c-92e4-4596d739eb34 @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . 
\ No newline at end of file diff --git a/docstore/5a439d81-600f-4d98-a9a0-72ef5245236e b/docstore/5a439d81-600f-4d98-a9a0-72ef5245236e new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/5a439d81-600f-4d98-a9a0-72ef5245236e @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/5a568ff1-e5c9-4316-8f15-b8095b84c0ce b/docstore/5a568ff1-e5c9-4316-8f15-b8095b84c0ce new file mode 100644 index 0000000000000000000000000000000000000000..eec969db3a5d75d1a5e593577cc3ea5cb6d88ea3 --- /dev/null +++ b/docstore/5a568ff1-e5c9-4316-8f15-b8095b84c0ce @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5a5cc579-02a2-4876-8cba-9567b391a019 b/docstore/5a5cc579-02a2-4876-8cba-9567b391a019 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/5a5cc579-02a2-4876-8cba-9567b391a019 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/5a68f392-7715-485e-9fed-5ef281093d20 b/docstore/5a68f392-7715-485e-9fed-5ef281093d20 new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/5a68f392-7715-485e-9fed-5ef281093d20 @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . 
These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/5a7039b9-485d-47c7-a95e-20841716e8b7 b/docstore/5a7039b9-485d-47c7-a95e-20841716e8b7 new file mode 100644 index 0000000000000000000000000000000000000000..7854fde30b22154bcc71df1cbbdf87cf29b5e25d --- /dev/null +++ b/docstore/5a7039b9-485d-47c7-a95e-20841716e8b7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/usage-policies#main-content Title: Additional usage policies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5a7f73ef-8faa-4707-bef4-a79fc74b6ca5 b/docstore/5a7f73ef-8faa-4707-bef4-a79fc74b6ca5 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/5a7f73ef-8faa-4707-bef4-a79fc74b6ca5 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. 
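As a rough, self-contained sketch of that strategy, the following example uploads a photo with the Files API and reuses the blog-post prompt shown above; the local file path and the model id are assumptions for illustration:

Python
from google import genai

client = genai.Client()

# Hypothetical path; substitute an actual image on disk.
meal_photo = client.files.upload(file="path/to/meal.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        meal_photo,
        "Write a short, engaging blog post based on this picture. "
        "It should include a description of the meal in the photo "
        "and talk about my journey meal prepping.",
    ],
)
print(response.text)

Because uploaded files expire after 48 hours, re-upload the image if you revisit the prompt later.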
This \ No newline at end of file diff --git a/docstore/5a8ed099-3e52-498a-9c86-91e558b57122 b/docstore/5a8ed099-3e52-498a-9c86-91e558b57122 new file mode 100644 index 0000000000000000000000000000000000000000..9139684952e56c77fdce37ab354efce80520ded1 --- /dev/null +++ b/docstore/5a8ed099-3e52-498a-9c86-91e558b57122 @@ -0,0 +1 @@ +dict ). When possible, the SDK will parse the returned JSON, and return the result in response.parsed . If you provided a pydantic class as the schema the SDK will convert that JSON to an instance of the class. from google import genai from pydantic import BaseModel client = genai . Client () class CountryInfo ( BaseModel ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Give me information of the United States.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : CountryInfo , }, ) response . parsed JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "List a few popular cookie recipes." , config : { responseMimeType : "application/json" , responseSchema : { type : "array" , items : { type : "object" , properties : { recipeName : { type : "string" }, ingredients : { type : "array" , items : { type : "string" } }, }, required : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); Files Upload Upload a file: Before Python import requests import pathlib import google.generativeai as genai # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) file = genai . upload_file ( path = 'a11.txt' ) model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Can you summarize this file:' , my_file ]) print ( response . text ) After Python import requests import pathlib from google import genai client = genai . Client () # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) \ No newline at end of file diff --git a/docstore/5a8f55a7-fd5a-4af5-a8bb-b3b08cd3179f b/docstore/5a8f55a7-fd5a-4af5-a8bb-b3b08cd3179f new file mode 100644 index 0000000000000000000000000000000000000000..c7d99b48acdb29ebe1cdd75df52d7215dd4d0ab1 --- /dev/null +++ b/docstore/5a8f55a7-fd5a-4af5-a8bb-b3b08cd3179f @@ -0,0 +1 @@ +- Zsh Zsh is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.zshrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use bash : touch ~/.zshrc open ~/.zshrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.zshrc Windows Search for "Environment Variables" in the system settings Edit either "User variables" (for current user) or "System variables" (for all users - use with caution). Create the variable and add export GEMINI_API_KEY=your_key_here Apply the changes Providing API key explicitly In some cases, you may want to explicitly provide an API key. 
For example: You're doing a simple API call and prefer hard coding the API key. You want explicit control without having to rely on automatic discovery of environment variables by the Gemini API libraries You're using an environment where environment variables are not supported (e.g web) or you are making REST calls. Below are examples for how you can provide an API key explicitly: Python from google import genai client = genai . Client ( api_key = " YOUR_API_KEY " ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : " YOUR_API_KEY " }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . \ No newline at end of file diff --git a/docstore/5aa9e3cb-7246-45d8-8f1e-a3d1c780e8e9 b/docstore/5aa9e3cb-7246-45d8-8f1e-a3d1c780e8e9 new file mode 100644 index 0000000000000000000000000000000000000000..4d2e24ea9938e140050ecec90b7146205f15036b --- /dev/null +++ b/docstore/5aa9e3cb-7246-45d8-8f1e-a3d1c780e8e9 @@ -0,0 +1 @@ +Embeddings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Embeddings Note: Introducing our first Gemini embedding model, available now to developers as gemini-embedding-exp-03-07 in the API. The Gemini API supports several embedding models that generate embeddings for words, phrases, code, and sentences. The resulting embeddings can then be used for tasks such as semantic search, text classification, and clustering, among many others. What are embeddings? Embeddings are numerical representations of text (or other media formats) that capture relationships between inputs. Text embeddings work by converting text into arrays of floating point numbers, called vectors . These vectors are designed to capture the meaning of the text. The length of the embedding array is called the vector's dimensionality . A passage of text might be represented by a vector containing hundreds of dimensions. Embeddings capture semantic meaning and context, which results in text with similar meanings having "closer" embeddings. For example, the sentence "I took my dog to the vet" and "I took my cat to the vet" would have embeddings that are close to each other in the vector space. You can use embeddings to compare different texts and understand how they relate. For example, if the embeddings of the text "cat" and "dog" are close together you can infer that these words are similar in meaning, context, or both. This enables a variety of common AI use cases . Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. 
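To make the "closer embeddings" idea concrete, here is a minimal sketch, assuming the google-genai Python SDK, the embedContent method exposed as client.models.embed_content, and the gemini-embedding-exp-03-07 model mentioned above; it embeds the two vet sentences and scores them with cosine similarity:

import numpy as np
from google import genai

client = genai.Client()

# Embed two semantically similar sentences in one call
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents=["I took my dog to the vet", "I took my cat to the vet"],
)
v1, v2 = (np.array(e.values) for e in result.embeddings)

# Cosine similarity: values near 1.0 indicate vectors that are "close",
# i.e. text with similar meaning
similarity = float(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2)))
print(similarity)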
Generate embeddings Use the embedContent method to generate \ No newline at end of file diff --git a/docstore/5ab3fc92-dfb0-43c7-b515-ba98c32ca659 b/docstore/5ab3fc92-dfb0-43c7-b515-ba98c32ca659 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/5ab3fc92-dfb0-43c7-b515-ba98c32ca659 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/5ab993d7-8516-4bda-bc91-331c17df9f51 b/docstore/5ab993d7-8516-4bda-bc91-331c17df9f51 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/5ab993d7-8516-4bda-bc91-331c17df9f51 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? 
Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/5abab400-b806-4b1e-8144-dfba4d8fca26 b/docstore/5abab400-b806-4b1e-8144-dfba4d8fca26 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/5abab400-b806-4b1e-8144-dfba4d8fca26 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . 
ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/5ad42bc6-e2cc-42b0-9786-8a4e4923764c b/docstore/5ad42bc6-e2cc-42b0-9786-8a4e4923764c new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/5ad42bc6-e2cc-42b0-9786-8a4e4923764c @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/5ae4c3c0-4761-458c-b011-227dbaec8cff b/docstore/5ae4c3c0-4761-458c-b011-227dbaec8cff new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/5ae4c3c0-4761-458c-b011-227dbaec8cff @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/5b0d7d8d-09a8-4bcc-bdd1-cbc11b4fe7fd b/docstore/5b0d7d8d-09a8-4bcc-bdd1-cbc11b4fe7fd new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/5b0d7d8d-09a8-4bcc-bdd1-cbc11b4fe7fd @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/5b20a706-2f8d-40f5-ab99-0f379d99b945 b/docstore/5b20a706-2f8d-40f5-ab99-0f379d99b945 new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5762f2e47222d475421a2613ce0f732fa260 --- /dev/null +++ b/docstore/5b20a706-2f8d-40f5-ab99-0f379d99b945 @@ -0,0 +1 @@ +in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at \ No newline at end of file diff --git a/docstore/5b2a29f2-214d-4f7e-8597-42ba731dc564 b/docstore/5b2a29f2-214d-4f7e-8597-42ba731dc564 new file mode 100644 index 0000000000000000000000000000000000000000..9139684952e56c77fdce37ab354efce80520ded1 --- /dev/null +++ b/docstore/5b2a29f2-214d-4f7e-8597-42ba731dc564 @@ -0,0 +1 @@ +dict ). When possible, the SDK will parse the returned JSON, and return the result in response.parsed . If you provided a pydantic class as the schema the SDK will convert that JSON to an instance of the class. from google import genai from pydantic import BaseModel client = genai . Client () class CountryInfo ( BaseModel ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Give me information of the United States.' 
, config = { 'response_mime_type' : 'application/json' , 'response_schema' : CountryInfo , }, ) response . parsed JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "List a few popular cookie recipes." , config : { responseMimeType : "application/json" , responseSchema : { type : "array" , items : { type : "object" , properties : { recipeName : { type : "string" }, ingredients : { type : "array" , items : { type : "string" } }, }, required : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); Files Upload Upload a file: Before Python import requests import pathlib import google.generativeai as genai # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) file = genai . upload_file ( path = 'a11.txt' ) model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Can you summarize this file:' , my_file ]) print ( response . text ) After Python import requests import pathlib from google import genai client = genai . Client () # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) \ No newline at end of file diff --git a/docstore/5b429a36-8f01-4434-9174-dfab17103c6d b/docstore/5b429a36-8f01-4434-9174-dfab17103c6d new file mode 100644 index 0000000000000000000000000000000000000000..916ff1a2193937e7de5315b15bd19b2ba8208c75 --- /dev/null +++ b/docstore/5b429a36-8f01-4434-9174-dfab17103c6d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5b5203d7-9589-43c3-a1b8-963da65dc339 b/docstore/5b5203d7-9589-43c3-a1b8-963da65dc339 new file mode 100644 index 0000000000000000000000000000000000000000..9d095483ee0339b49f1b8cdd8e03c270e656b0d4 --- /dev/null +++ b/docstore/5b5203d7-9589-43c3-a1b8-963da65dc339 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/semantic_retrieval Title: Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5b5b5f95-4a2b-45cd-9ebd-0f9877c41536 b/docstore/5b5b5f95-4a2b-45cd-9ebd-0f9877c41536 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/5b5b5f95-4a2b-45cd-9ebd-0f9877c41536 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/5b5fe3f8-4e36-4ac1-be8c-de5ce15424d1 b/docstore/5b5fe3f8-4e36-4ac1-be8c-de5ce15424d1 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/5b5fe3f8-4e36-4ac1-be8c-de5ce15424d1 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/5b66cee6-1493-4e79-87fb-967f789209e9 b/docstore/5b66cee6-1493-4e79-87fb-967f789209e9 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/5b66cee6-1493-4e79-87fb-967f789209e9 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/5b6c43cd-6a68-46fd-8100-119d9efe85dc b/docstore/5b6c43cd-6a68-46fd-8100-119d9efe85dc new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/5b6c43cd-6a68-46fd-8100-119d9efe85dc @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . 
\ No newline at end of file diff --git a/docstore/5b7f9f10-8f9b-4a8d-bef2-7e1f33a3d491 b/docstore/5b7f9f10-8f9b-4a8d-bef2-7e1f33a3d491 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/5b7f9f10-8f9b-4a8d-bef2-7e1f33a3d491 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/5b8fe83a-750d-44fc-bda8-2271b870b9d8 b/docstore/5b8fe83a-750d-44fc-bda8-2271b870b9d8 new file mode 100644 index 0000000000000000000000000000000000000000..7c883bd368c31d390cf31dfb7ab8807048f20c67 --- /dev/null +++ b/docstore/5b8fe83a-750d-44fc-bda8-2271b870b9d8 @@ -0,0 +1 @@ +caching price Not available $0.075 (text / image / video) $0.25 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash-Lite RPD) 1,500 RPD (free, limit shared with Flash-Lite RPD), then $35 / 1,000 requests Live API Free of charge Input: $0.50 (text), $3.00 (audio / image [video]) Output: $2.00 (text), $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash-Lite Preview Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. Preview models may change before becoming stable and have more restrictive rate limits. 
Free Tier Paid Tier, per 1M tokens in USD Input price (text, image, video) Free of charge $0.10 (text / image / video) $0.50 (audio) Output price (including thinking tokens) Free of charge $0.40 Context caching price Not available $0.025 (text / image / video) $0.125 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash RPD) 1,500 RPD (free, limit shared with Flash RPD), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Native Audio Try it in Google AI Studio Our native audio models optimized for higher quality audio outputs with better pacing, voice naturalness, verbosity, and mood. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $0.50 (text) $3.00 (audio / video) Output price (including thinking tokens) Not available $2.00 (text) $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash Preview TTS Try it in Google AI Studio Our 2.5 Flash text-to-speech audio model optimized for price-performant, low-latency, controllable speech generation. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Free of \ No newline at end of file diff --git a/docstore/5b96ec10-5413-483e-903d-aee5b3ffcba1 b/docstore/5b96ec10-5413-483e-903d-aee5b3ffcba1 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/5b96ec10-5413-483e-903d-aee5b3ffcba1 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . 
NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/5bb14613-85c7-4104-a72f-2fe045881a1d b/docstore/5bb14613-85c7-4104-a72f-2fe045881a1d new file mode 100644 index 0000000000000000000000000000000000000000..6b5570dc552776eef13cf8339199673fd1c28eb5 --- /dev/null +++ b/docstore/5bb14613-85c7-4104-a72f-2fe045881a1d @@ -0,0 +1 @@ +Generate an image Note: Image generation is only available in the paid tier. Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print \ No newline at end of file diff --git a/docstore/5bb44183-9ea3-4068-9666-6db5f0f9155d b/docstore/5bb44183-9ea3-4068-9666-6db5f0f9155d new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/5bb44183-9ea3-4068-9666-6db5f0f9155d @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . 
SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/5bc01182-cbb0-4058-a803-c2334fe0acef b/docstore/5bc01182-cbb0-4058-a803-c2334fe0acef new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/5bc01182-cbb0-4058-a803-c2334fe0acef @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. 
Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/5bd2af38-2302-478a-be7c-77530186c9f4 b/docstore/5bd2af38-2302-478a-be7c-77530186c9f4 new file mode 100644 index 0000000000000000000000000000000000000000..35c81ca457815c8b08cb0a15d57e514821a77fa5 --- /dev/null +++ b/docstore/5bd2af38-2302-478a-be7c-77530186c9f4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling?example=weather#step-4 Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5bf06ee6-14b4-422c-b49b-708773bf9081 b/docstore/5bf06ee6-14b4-422c-b49b-708773bf9081 new file mode 100644 index 0000000000000000000000000000000000000000..15baa9558f9c519c1fd89e2f300757ea3d7d33d9 --- /dev/null +++ b/docstore/5bf06ee6-14b4-422c-b49b-708773bf9081 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#troubleshooting Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5c0a92bb-8a15-4784-9370-0b9da328922f b/docstore/5c0a92bb-8a15-4784-9370-0b9da328922f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/5c0a92bb-8a15-4784-9370-0b9da328922f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/5c1792c1-d776-4be7-9bbf-0de97b47702b b/docstore/5c1792c1-d776-4be7-9bbf-0de97b47702b new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/5c1792c1-d776-4be7-9bbf-0de97b47702b @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/5c237a97-82a2-4882-b06d-11891090c514 b/docstore/5c237a97-82a2-4882-b06d-11891090c514 new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/5c237a97-82a2-4882-b06d-11891090c514 @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. 
When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/5c26f564-d967-4815-8660-979ff988376f b/docstore/5c26f564-d967-4815-8660-979ff988376f new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/5c26f564-d967-4815-8660-979ff988376f @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . 
map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/5c31c00b-7afd-4cad-9cc0-a29c25198e68 b/docstore/5c31c00b-7afd-4cad-9cc0-a29c25198e68 new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/5c31c00b-7afd-4cad-9cc0-a29c25198e68 @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. This lets to capture more of the scene vertically compared to the 1:1 aspect ratio. Prompt: a woman hiking, close of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. This a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/5c364e72-86f4-4428-95ba-ca459433499b b/docstore/5c364e72-86f4-4428-95ba-ca459433499b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/5c364e72-86f4-4428-95ba-ca459433499b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/5c38b054-9182-4727-87af-348773ecb388 b/docstore/5c38b054-9182-4727-87af-348773ecb388 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/5c38b054-9182-4727-87af-348773ecb388 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. 
It's your application's responsibility to \ No newline at end of file diff --git a/docstore/5c6e92dd-c8b0-4333-b928-fad33fead10f b/docstore/5c6e92dd-c8b0-4333-b928-fad33fead10f new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/5c6e92dd-c8b0-4333-b928-fad33fead10f @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/5c978f5a-9ad1-4745-89fd-3c72f5e652d3 b/docstore/5c978f5a-9ad1-4745-89fd-3c72f5e652d3 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/5c978f5a-9ad1-4745-89fd-3c72f5e652d3 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/5ca5706a-6429-4bd1-ac59-a8e05964fe71 b/docstore/5ca5706a-6429-4bd1-ac59-a8e05964fe71 new file mode 100644 index 0000000000000000000000000000000000000000..ef7f15fc424a675301db24205f149dd75b0faa06 --- /dev/null +++ b/docstore/5ca5706a-6429-4bd1-ac59-a8e05964fe71 @@ -0,0 +1 @@ +Ephemeral tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Ephemeral tokens Ephemeral tokens are short-lived authentication tokens for accessing the Gemini API through WebSockets . They are designed to enhance security when you are connecting directly from a user's device to the API (a client-to-server implementation). Like standard API keys, ephemeral tokens can be extracted from client-side applications such as web browsers or mobile apps. But because ephemeral tokens expire quickly and can be restricted, they significantly reduce the security risks in a production environment. Note: Ephemeral tokens are only compatible with Live API at this time. You should use them when accessing the Live API directly from client-side applications to enhance API key security. How ephemeral tokens work Here's how ephemeral tokens work at a high level: Your client (e.g. web app) authenticates with your backend. Your backend requests an ephemeral token from Gemini API's provisioning service. Gemini API issues a short-lived token. Your backend sends the token to the client for WebSocket connections to Live API. You can do this by swapping your API key with an ephemeral token. The client then uses the token as if it were an API key. This enhances security because even if extracted, the token is short-lived, unlike a long-lived API key deployed client-side. Since the client sends data directly to Gemini, this also improves latency and avoids your backends needing to proxy the real time data. Create an ephemeral token Here is a simplified example of how to get an ephemeral token from Gemini. 
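(A minimal Python sketch of that backend step, shown here for concreteness. It assumes the `client.auth_tokens.create` helper of the `google-genai` SDK; the config field names below — `uses`, `expire_time`, `new_session_expire_time` — are illustrative assumptions to verify against the SDK reference, not values confirmed by this page.)

```python
# Backend-only sketch: mint a short-lived token to hand to a browser client.
import datetime
from google import genai

# The long-lived API key stays server-side.
client = genai.Client(http_options={'api_version': 'v1alpha'})

now = datetime.datetime.now(tz=datetime.timezone.utc)
token = client.auth_tokens.create(
    config={
        'uses': 1,                                                      # one Live API session
        'expire_time': now + datetime.timedelta(minutes=30),            # hard expiry of the token
        'new_session_expire_time': now + datetime.timedelta(minutes=1), # window to start a session
    }
)

# Return token.name to the client; it is used in place of an API key
# when opening the Live API WebSocket connection.
print(token.name)
```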
By default, you'll have 1 minute to start new Live API \ No newline at end of file diff --git a/docstore/5ce7ee05-3fc6-4953-9778-27c94459f984 b/docstore/5ce7ee05-3fc6-4953-9778-27c94459f984 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/5ce7ee05-3fc6-4953-9778-27c94459f984 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/5cfe6022-a895-4f43-8b2e-86ee9c7d0b7a b/docstore/5cfe6022-a895-4f43-8b2e-86ee9c7d0b7a new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/5cfe6022-a895-4f43-8b2e-86ee9c7d0b7a @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/5d2aca8a-1459-4a60-a37f-96a11d40aab0 b/docstore/5d2aca8a-1459-4a60-a37f-96a11d40aab0 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/5d2aca8a-1459-4a60-a37f-96a11d40aab0 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/5d3c7d02-0d78-4f6b-9b0e-331d8cc431d6 b/docstore/5d3c7d02-0d78-4f6b-9b0e-331d8cc431d6 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/5d3c7d02-0d78-4f6b-9b0e-331d8cc431d6 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/5d46a9f4-21e1-4816-adb0-36384d0783a1 b/docstore/5d46a9f4-21e1-4816-adb0-36384d0783a1 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/5d46a9f4-21e1-4816-adb0-36384d0783a1 @@ -0,0 +1 @@ +correct reasoning steps afterward. 
To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/5d710960-d8fa-4ca9-8cca-7dcf05b5b459 b/docstore/5d710960-d8fa-4ca9-8cca-7dcf05b5b459 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/5d710960-d8fa-4ca9-8cca-7dcf05b5b459 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/5d953776-b5e1-4c11-8420-7c66b20e03a6 b/docstore/5d953776-b5e1-4c11-8420-7c66b20e03a6 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/5d953776-b5e1-4c11-8420-7c66b20e03a6 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/5da0ea18-30aa-4d71-bb93-5ffd368ea02e b/docstore/5da0ea18-30aa-4d71-bb93-5ffd368ea02e new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/5da0ea18-30aa-4d71-bb93-5ffd368ea02e @@ -0,0 +1 @@ +patterns for more details. 
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/5db98079-6aa2-450d-8ca8-9995339bb7f8 b/docstore/5db98079-6aa2-450d-8ca8-9995339bb7f8 new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/5db98079-6aa2-450d-8ca8-9995339bb7f8 @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). 
The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/5dc02e03-b84a-4b15-8bf2-8c3669a7ee17 b/docstore/5dc02e03-b84a-4b15-8bf2-8c3669a7ee17 new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/5dc02e03-b84a-4b15-8bf2-8c3669a7ee17 @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . 
aio \ No newline at end of file diff --git a/docstore/5dd65228-fb43-4e67-910f-e7ecedbbb56b b/docstore/5dd65228-fb43-4e67-910f-e7ecedbbb56b new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/5dd65228-fb43-4e67-910f-e7ecedbbb56b @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/5ddaeba0-2287-459b-9054-2d97d1422c1a b/docstore/5ddaeba0-2287-459b-9054-2d97d1422c1a new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/5ddaeba0-2287-459b-9054-2d97d1422c1a @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . 
To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/5dde44ac-31d0-45cf-afc8-777e61e941db b/docstore/5dde44ac-31d0-45cf-afc8-777e61e941db new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/5dde44ac-31d0-45cf-afc8-777e61e941db @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/5dfa8dc8-d883-4db3-aae1-bb482d14f0f7 b/docstore/5dfa8dc8-d883-4db3-aae1-bb482d14f0f7 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/5dfa8dc8-d883-4db3-aae1-bb482d14f0f7 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/5e00f5b2-b812-4e9f-af22-f2e4d1b70fc7 b/docstore/5e00f5b2-b812-4e9f-af22-f2e4d1b70fc7 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/5e00f5b2-b812-4e9f-af22-f2e4d1b70fc7 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. 
Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/5e296c1b-ddba-49a9-adea-bcd1f60a472c b/docstore/5e296c1b-ddba-49a9-adea-bcd1f60a472c new file mode 100644 index 0000000000000000000000000000000000000000..8219f10e184a0891e4bb35822a37a2ddc4e20372 --- /dev/null +++ b/docstore/5e296c1b-ddba-49a9-adea-bcd1f60a472c @@ -0,0 +1 @@ +"type": "STRING" }, "ingredients": { "type": "ARRAY", "items": { "type": "STRING" } } }, "propertyOrdering": ["recipeName", "ingredients"] } } } }' 2 > /dev/null | head The output might look like this: [ { "recipeName" : "Chocolate Chip Cookies" , "ingredients" : [ "1 cup (2 sticks) unsalted butter, softened" , "3/4 cup granulated sugar" , "3/4 cup packed brown sugar" , "1 teaspoon vanilla extract" , "2 large eggs" , "2 1/4 cups all-purpose flour" , "1 teaspoon baking soda" , "1 teaspoon salt" , "2 cups chocolate chips" ] }, ... ] Providing a schema in a text prompt Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended , because it might produce lower quality output, and because the model is not constrained to follow the schema. Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. \ No newline at end of file diff --git a/docstore/5e2fe627-41e6-4159-b17a-fe1e5c3bce96 b/docstore/5e2fe627-41e6-4159-b17a-fe1e5c3bce96 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/5e2fe627-41e6-4159-b17a-fe1e5c3bce96 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
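(As an illustration of measuring relatedness with embeddings, here is a minimal sketch assuming the `google-genai` Python SDK's `client.models.embed_content` method and NumPy for the cosine-similarity arithmetic; treat the exact argument names as assumptions to check against the SDK reference.)

```python
import numpy as np
from google import genai

client = genai.Client()

# Embed two strings with text-embedding-004 (768-dimensional vectors).
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How do I bake a cake?", "Recipe for a simple sponge cake"],
)
vec_a, vec_b = (np.array(e.values) for e in result.embeddings)

# Cosine similarity as a relatedness score in [-1, 1]; higher means more related.
score = float(vec_a @ vec_b / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print(f"cosine similarity: {score:.3f}")
```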
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/5e4f515e-6f2e-40e6-a2b6-02ea9f67bc48 b/docstore/5e4f515e-6f2e-40e6-a2b6-02ea9f67bc48 new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/5e4f515e-6f2e-40e6-a2b6-02ea9f67bc48 @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. 
Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example \ No newline at end of file diff --git a/docstore/5e6409e9-2c36-4db4-b949-75e4d70b5784 b/docstore/5e6409e9-2c36-4db4-b949-75e4d70b5784 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/5e6409e9-2c36-4db4-b949-75e4d70b5784 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/5e82892b-6c1a-4b10-b442-cc86bff186af b/docstore/5e82892b-6c1a-4b10-b442-cc86bff186af new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/5e82892b-6c1a-4b10-b442-cc86bff186af @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/5e921d8e-1398-480d-a1f9-5544cfe275b7 b/docstore/5e921d8e-1398-480d-a1f9-5544cfe275b7 new file mode 100644 index 0000000000000000000000000000000000000000..771c2c741948f29f5c3605e7090d7f1d54bfcf1f --- /dev/null +++ b/docstore/5e921d8e-1398-480d-a1f9-5544cfe275b7 @@ -0,0 +1 @@ +For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion" , "String" , "Woodwind" , "Brass" , or " "Keyboard" ". You could create an enum to help with this task. In the following example, you pass an enum as the responseSchema , constraining the model to choose the most appropriate option. Python from google import genai import enum class Instrument ( enum . Enum ): PERCUSSION = "Percussion" STRING = "String" WOODWIND = "Woodwind" BRASS = "Brass" KEYBOARD = "Keyboard" client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : Instrument , }, ) print ( response . text ) # Woodwind JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "What type of instrument is an oboe?" , config : { responseMimeType : "text/x.enum" , responseSchema : { type : Type . STRING , enum : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, }); console . log ( response . text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "What type of instrument is an oboe?" 
} ] }], "generationConfig": { "responseMimeType": "text/x.enum", "responseSchema": { "type": "STRING", "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"] } } }' The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ( Schema ). There are two other ways to specify an enumeration. You can use a Literal : ``` Python Literal [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ] \ No newline at end of file diff --git a/docstore/5eb04709-76cc-45f1-bd67-462a85c5932d b/docstore/5eb04709-76cc-45f1-bd67-462a85c5932d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/5eb04709-76cc-45f1-bd67-462a85c5932d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/5ece5544-79a6-4c27-b856-dde7dc53cb2c b/docstore/5ece5544-79a6-4c27-b856-dde7dc53cb2c new file mode 100644 index 0000000000000000000000000000000000000000..2e63962f4f91f045adaad4d78b0dd4e4935b1e33 --- /dev/null +++ b/docstore/5ece5544-79a6-4c27-b856-dde7dc53cb2c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#segmentation Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/5ed94ef9-67b6-4543-9ec1-83d94b791e12 b/docstore/5ed94ef9-67b6-4543-9ec1-83d94b791e12 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/5ed94ef9-67b6-4543-9ec1-83d94b791e12 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details; it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/5ee39f12-650c-4548-b1b4-b4abea00e10d b/docstore/5ee39f12-650c-4548-b1b4-b4abea00e10d new file mode 100644 index 0000000000000000000000000000000000000000..eb233fee8099f5f789dde3693dc446d13c990aff --- /dev/null +++ b/docstore/5ee39f12-650c-4548-b1b4-b4abea00e10d @@ -0,0 +1 @@ +UploadFile ( ... ) After (Centralized Client Object) Python from google import genai # Create a single client object client = genai . Client () # Access API methods through services on the client object response = client . models . generate_content ( ... ) chat = client . chats . create ( ...
) my_file = client . files . upload ( ... ) tuning_job = client . tunings . tune ( ... ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Create a single client object const ai = new GoogleGenAI ({ apiKey : "YOUR_API_KEY" }); // Access API methods through services on the client object const response = await ai . models . generateContent (...); const chat = ai . chats . create (...); const uploadedFile = await ai . files . upload (...); const cache = await ai . caches . create (...); Go import "google.golang.org/genai" // Create a single client object client , err := genai . NewClient ( ctx , nil ) // Access API methods through services on the client object result , err := client . Models . GenerateContent ( ... ) chat , err := client . Chats . Create ( ... ) uploadedFile , err := client . Files . Upload ( ... ) tuningJob , err := client . Tunings . Tune ( ... ) Authentication Both legacy and new libraries authenticate using API keys. You can create your API key in Google AI Studio. Before Python The old SDK handled the API client object implicitly. import google.generativeai as genai genai . configure ( api_key =... ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); Go Import the Google libraries: import ( "github.com/google/generative-ai-go/genai" "google.golang.org/api/option" ) Create the client: client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) After Python With Google GenAI SDK, you create an API client first, which is used to call the API. The new SDK will pick up your API key from either one of the GEMINI_API_KEY or GOOGLE_API_KEY environment \ No newline at end of file diff --git a/docstore/5ee518b3-7681-46b2-b2e9-546740538e1f b/docstore/5ee518b3-7681-46b2-b2e9-546740538e1f new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/5ee518b3-7681-46b2-b2e9-546740538e1f @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
\ No newline at end of file diff --git a/docstore/5f0767ef-be1c-4dda-9bed-b09ae1c244a3 b/docstore/5f0767ef-be1c-4dda-9bed-b09ae1c244a3 new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/5f0767ef-be1c-4dda-9bed-b09ae1c244a3 @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/5f1077bb-1309-4c46-8103-0740e56fc21c b/docstore/5f1077bb-1309-4c46-8103-0740e56fc21c new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/5f1077bb-1309-4c46-8103-0740e56fc21c @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . 
debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/5f1221f6-930b-4e06-bef4-bad402953d01 b/docstore/5f1221f6-930b-4e06-bef4-bad402953d01 new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/5f1221f6-930b-4e06-bef4-bad402953d01 @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . 
When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/5f35c88f-318d-4452-b438-acf0ae2797b9 b/docstore/5f35c88f-318d-4452-b438-acf0ae2797b9 new file mode 100644 index 0000000000000000000000000000000000000000..4b6418baecebd23eec6598a4eb723dc1516263bd --- /dev/null +++ b/docstore/5f35c88f-318d-4452-b438-acf0ae2797b9 @@ -0,0 +1 @@ +default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. \ No newline at end of file diff --git a/docstore/5f6972b4-3c66-47c6-b87b-9775f988544c b/docstore/5f6972b4-3c66-47c6-b87b-9775f988544c new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/5f6972b4-3c66-47c6-b87b-9775f988544c @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . 
text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is an experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources or prompts. It is available for the Python and JavaScript/TypeScript SDKs. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limitations affect what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/5f7b3795-2280-4574-b2da-9219f002e61b b/docstore/5f7b3795-2280-4574-b2da-9219f002e61b new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/5f7b3795-2280-4574-b2da-9219f002e61b @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability.
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/5f7f7113-5e29-4d34-894a-afb5a8a5dc0b b/docstore/5f7f7113-5e29-4d34-894a-afb5a8a5dc0b new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/5f7f7113-5e29-4d34-894a-afb5a8a5dc0b @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/5f9ab7c0-f039-49bd-bbbf-efa4624d9c2a b/docstore/5f9ab7c0-f039-49bd-bbbf-efa4624d9c2a new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/5f9ab7c0-f039-49bd-bbbf-efa4624d9c2a @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/5f9da16b-28f8-4523-a3c4-5d3cb9791941 b/docstore/5f9da16b-28f8-4523-a3c4-5d3cb9791941 new file mode 100644 index 0000000000000000000000000000000000000000..1928fbda4690570381db2fc0734d5c40f27390c8 --- /dev/null +++ b/docstore/5f9da16b-28f8-4523-a3c4-5d3cb9791941 @@ -0,0 +1 @@ +Part { InlineData : & genai . Blob { MIMEType : "audio/mp3" , Data : audioBytes , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . 
GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } A few things to keep in mind about inline audio data: The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the total request size exceed 20 MB, then use the Files API to upload an audio file for use in the request. If you're using an audio sample multiple times, it's more efficient to upload an audio file . Get a transcript To get a transcript of audio data, just ask for it in the prompt: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) prompt = 'Generate a transcript of the speech.' response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ prompt , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Generate a transcript of the speech." , ]), }); console . log ( "result.text=" , result . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Generate a transcript of the speech." ), \ No newline at end of file diff --git a/docstore/5fb45f1c-5687-4761-b052-11cba0e98776 b/docstore/5fb45f1c-5687-4761-b052-11cba0e98776 new file mode 100644 index 0000000000000000000000000000000000000000..8c60a97b59d947e95247d6e4ee3eb21605ab2ae3 --- /dev/null +++ b/docstore/5fb45f1c-5687-4761-b052-11cba0e98776 @@ -0,0 +1 @@ +open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. 
VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of \ No newline at end of file diff --git a/docstore/5fb4ddc1-1f4c-4d20-8e58-136ff728e3e8 b/docstore/5fb4ddc1-1f4c-4d20-8e58-136ff728e3e8 new file mode 100644 index 0000000000000000000000000000000000000000..ff0e917f3633351471582782b39463f2a8e6c8ed --- /dev/null +++ b/docstore/5fb4ddc1-1f4c-4d20-8e58-136ff728e3e8 @@ -0,0 +1 @@ +Gemini Developer API v.s. Vertex AI | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini Developer API v.s. Vertex AI When developing generative AI solutions with Gemini, Google offers two API products: the Gemini Developer API and the Vertex AI Gemini API . The Gemini Developer API provides the fastest path to build, productionize, and scale Gemini powered applications. Most developers should use the Gemini Developer API unless there is a need for specific enterprise controls. Vertex AI offers a comprehensive ecosystem of enterprise ready features and services for building and deploying generative AI applications backed by the Google Cloud Platform. We've recently simplified migrating between these services. Both the Gemini Developer API and the Vertex AI Gemini API are now accessible through the unified Google Gen AI SDK . Code comparison This page has side-by-side code comparisons between Gemini Developer API and Vertex AI quickstarts for text generation. Python You can access both the Gemini Developer API and Vertex AI services through the google-genai library. See the libraries page for instructions on how to install google-genai . Gemini Developer API from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) Vertex AI Gemini API from google import genai client = genai . Client ( vertexai = True , project = 'your-project-id' , location = 'us-central1' ) response = client . models . 
generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) \ No newline at end of file diff --git a/docstore/5fcea724-f9d2-4a66-97e1-2edc9ed68cdc b/docstore/5fcea724-f9d2-4a66-97e1-2edc9ed68cdc new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/5fcea724-f9d2-4a66-97e1-2edc9ed68cdc @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/5fdf3844-496d-411b-a8ce-d9decdc4bbdb b/docstore/5fdf3844-496d-411b-a8ce-d9decdc4bbdb new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/5fdf3844-496d-411b-a8ce-d9decdc4bbdb @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . 
Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/5fe73c43-7c96-4363-9bbc-32e7418a19ab b/docstore/5fe73c43-7c96-4363-9bbc-32e7418a19ab new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/5fe73c43-7c96-4363-9bbc-32e7418a19ab @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
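A minimal sketch of requesting an embedding from this experimental model, assuming the google-genai Python SDK (from google import genai) used in the other chunks; the model id gemini-embedding-exp-03-07 is the one listed in the model details that follow:

```python
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment

# Request an embedding from the experimental Gemini embedding model.
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="How are text embeddings used for retrieval?",
)

# One embedding is returned per input; each embedding is a list of floats.
print(len(result.embeddings[0].values))
```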
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/5ff0c8cf-a276-41b4-9723-427e7f6036e2 b/docstore/5ff0c8cf-a276-41b4-9723-427e7f6036e2 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/5ff0c8cf-a276-41b4-9723-427e7f6036e2 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/5ff126f4-8f07-4ada-a7ed-8777b6855a6e b/docstore/5ff126f4-8f07-4ada-a7ed-8777b6855a6e new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/5ff126f4-8f07-4ada-a7ed-8777b6855a6e @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/5ff2b5d9-c41a-4990-b19e-32e5544082a6 b/docstore/5ff2b5d9-c41a-4990-b19e-32e5544082a6 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/5ff2b5d9-c41a-4990-b19e-32e5544082a6 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . 
For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/5ff7fb37-4c82-4a3e-b3cb-69671f1ce443 b/docstore/5ff7fb37-4c82-4a3e-b3cb-69671f1ce443 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/5ff7fb37-4c82-4a3e-b3cb-69671f1ce443 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
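A rough sketch of calling one of these preview TTS models with the google-genai Python SDK used elsewhere in these chunks; the config field names are assumptions based on that SDK, and the voice name "Kore" is taken from the voice list that appears later in this section:

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",  # Flash preview TTS model named above
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio is returned as inline bytes on the first candidate part.
audio_bytes = response.candidates[0].content.parts[0].inline_data.data
with open("out.pcm", "wb") as f:
    f.write(audio_bytes)
```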
\ No newline at end of file diff --git a/docstore/60443c45-e0c7-4b43-99c7-4d8ba9b7c1f0 b/docstore/60443c45-e0c7-4b43-99c7-4d8ba9b7c1f0 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/60443c45-e0c7-4b43-99c7-4d8ba9b7c1f0 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/605ad26c-0983-428c-b228-968c5ddc103e b/docstore/605ad26c-0983-428c-b228-968c5ddc103e new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/605ad26c-0983-428c-b228-968c5ddc103e @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. 
The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/605c58dc-413c-4797-8d60-f5e1b3a06430 b/docstore/605c58dc-413c-4797-8d60-f5e1b3a06430 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/605c58dc-413c-4797-8d60-f5e1b3a06430 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/60607702-c78e-4999-b6cd-9305f737a059 b/docstore/60607702-c78e-4999-b6cd-9305f737a059 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/60607702-c78e-4999-b6cd-9305f737a059 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/60657822-6c4d-4b04-90e1-c453c86b5b12 b/docstore/60657822-6c4d-4b04-90e1-c453c86b5b12 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/60657822-6c4d-4b04-90e1-c453c86b5b12 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/607c8824-71a5-4ada-8033-d7ae847200cd b/docstore/607c8824-71a5-4ada-8033-d7ae847200cd new file mode 100644 index 0000000000000000000000000000000000000000..ea2e919435dd65879cd5a9105357abfb2c8ba699 --- /dev/null +++ b/docstore/607c8824-71a5-4ada-8033-d7ae847200cd @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/609206cf-b1a2-48ca-9b61-a3497c35754d b/docstore/609206cf-b1a2-48ca-9b61-a3497c35754d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/609206cf-b1a2-48ca-9b61-a3497c35754d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/60b3c69c-9649-46c9-be32-54c12e9aff9f b/docstore/60b3c69c-9649-46c9-be32-54c12e9aff9f new file mode 100644 index 0000000000000000000000000000000000000000..1749162b760803024b206e202dc9d02405afd1c5 --- /dev/null +++ b/docstore/60b3c69c-9649-46c9-be32-54c12e9aff9f @@ -0,0 +1 @@ +Google employees via an internal governance assessment and review management platform. When data is logged for abuse monitoring, it is used solely for the purpose of policy enforcement and is not used to train or fine-tune any AI/ML models. Working with You on Policy Compliance If your use of Gemini doesn't align with our policies, we may take the following steps: Get in touch: We may reach out to you through email to understand your use case and explore ways to bring your usage into compliance. Temporary usage limits: We may limit your access to the Gemini API. Temporary suspension: We may temporarily pause your access to the Gemini API. Account closure: As a last resort, and for serious violations, we may permanently close your access to the Gemini API and other Google services. Scope These policy guidelines apply to the use of the Gemini API and AI Studio. Inline Preference Voting In Google AI Studio, you might occasionally see a side-by-side comparison of two different responses to your prompt. This is part of our Inline Preference Voting system. You'll be asked to choose which response you prefer. This helps us understand which model outputs users find most helpful. Why are we doing this? We're constantly working to improve our AI models and services. Your feedback through Inline Preference Voting helps us provide, improve, and develop Google products and services and machine learning technologies, including Google's enterprise features, products and services, consistent with the Gemini API Additional Terms of Service and Privacy Policy . What data is included in Feedback? To make informed decisions about our models, we collect certain data when you participate in Inline Preference Voting: Prompts and Responses: We record all prompts and responses, including any uploaded content, in the conversation you submitted feedback about. We also record the two response options that you selected from. This helps us understand the context of your preference. Your Vote: We \ No newline at end of file diff --git a/docstore/60bb0bbf-38a6-484b-a6fa-7048a95555a0 b/docstore/60bb0bbf-38a6-484b-a6fa-7048a95555a0 new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/60bb0bbf-38a6-484b-a6fa-7048a95555a0 @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. 
Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/60d60f6d-1390-4a38-93ae-624e28b399f6 b/docstore/60d60f6d-1390-4a38-93ae-624e28b399f6 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/60d60f6d-1390-4a38-93ae-624e28b399f6 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
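To show how the constraint and system-instruction examples above might be applied programmatically, here is a minimal sketch using the google-genai Python SDK; the model name, the placeholder text, and the API-key setup are assumptions for illustration, not part of the original examples.

```python
from google import genai
from google.genai import types

# Minimal sketch: assumes GEMINI_API_KEY is set in the environment.
client = genai.Client()

# Constraint expressed directly in the prompt (one-sentence summary).
summary = client.models.generate_content(
    model="gemini-2.5-flash",  # assumed model name for illustration
    contents="Summarize this text in one sentence: <text to summarize>",
)
print(summary.text)

# Response-format guidance supplied as a system instruction.
answer = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is a smart way to make a business that sells DVDs in 2025?",
    config=types.GenerateContentConfig(
        system_instruction=(
            "All questions should be answered comprehensively with details, "
            "unless the user requests a concise response specifically."
        ),
    ),
)
print(answer.text)
```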
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/612451df-018f-47c9-9b40-456a5303edbc b/docstore/612451df-018f-47c9-9b40-456a5303edbc new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/612451df-018f-47c9-9b40-456a5303edbc @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . 
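As a rough illustration of the image capabilities listed above (captioning and visual question answering in a single request), the following sketch passes an inline image with the google-genai Python SDK; the file path, MIME type, question, and model name are placeholder assumptions.

```python
from google import genai
from google.genai import types

# Minimal sketch: assumes GEMINI_API_KEY is set in the environment.
client = genai.Client()

# Read a local image and pass it inline; image/png is one of the supported MIME types.
with open("path/to/image.png", "rb") as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",  # assumed model name for illustration
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/png"),
        "Caption this image, then answer: how many distinct objects are visible?",
    ],
)
print(response.text)
```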
Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/614e3663-658c-41b1-ba96-27be35793d69 b/docstore/614e3663-658c-41b1-ba96-27be35793d69 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/614e3663-658c-41b1-ba96-27be35793d69 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. 
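Before moving on to iteration strategies: the output parameters just described (maximum output tokens, temperature, topK, topP, and stop sequences) are normally set together on the request's generation config. The following is a minimal sketch using the google-genai Python SDK; the parameter values and prompt are illustrative assumptions, not recommendations.

```python
from google import genai
from google.genai import types

# Minimal sketch: assumes GEMINI_API_KEY is set in the environment.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",  # assumed model name for illustration
    contents="Write a short product description for a reusable water bottle.",
    config=types.GenerateContentConfig(
        max_output_tokens=200,    # a token is roughly four characters
        temperature=0.4,          # lower = more deterministic, higher = more creative
        top_p=0.95,               # the default topP value noted above
        top_k=40,                 # sample among the 40 most probable tokens
        stop_sequences=["STOP"],  # generation halts if this sequence is produced
    ),
)
print(response.text)
```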
This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/61531f2f-db87-4bda-90ce-4e91f084a3fb b/docstore/61531f2f-db87-4bda-90ce-4e91f084a3fb new file mode 100644 index 0000000000000000000000000000000000000000..3d241d4ff001a9868e67072728eae208556221c7 --- /dev/null +++ b/docstore/61531f2f-db87-4bda-90ce-4e91f084a3fb @@ -0,0 +1 @@ +"https://vertexaisearch.cloud.google.com/grounding-api-redirect/1234567890abcdef" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/abcdef1234567890" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : " YOUR_URL " , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/fedcba0987654321" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > } ] } } } Supported models gemini-2.5-pro gemini-2.5-flash gemini-2.5-flash-lite gemini-2.0-flash gemini-2.0-flash-live-001 Limitations The tool will consume up to 20 URLs per request for analysis. For best results during experimental phase, use the tool on standard web pages rather than multimedia content such as YouTube videos. During experimental phase, the tool is free to use. Billing to come later. The experimental release has the following quotas: 1500 queries per day per project for requests made through the Gemini API 100 queries per day per user in Google AI Studio Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/6168a81c-4796-4eb2-a877-4e7ac924f60a b/docstore/6168a81c-4796-4eb2-a877-4e7ac924f60a new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/6168a81c-4796-4eb2-a877-4e7ac924f60a @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . 
to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/617c8ce7-3bd6-4c17-a6b5-c09fa29974b7 b/docstore/617c8ce7-3bd6-4c17-a6b5-c09fa29974b7 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/617c8ce7-3bd6-4c17-a6b5-c09fa29974b7 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/618c408b-4ce7-4f82-8780-1d97946b4cd9 b/docstore/618c408b-4ce7-4f82-8780-1d97946b4cd9 new file mode 100644 index 0000000000000000000000000000000000000000..9a3ae8e54d036eb9d08cf51953b4e3479c03ffae --- /dev/null +++ b/docstore/618c408b-4ce7-4f82-8780-1d97946b4cd9 @@ -0,0 +1 @@ +Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/619c4e49-c2ac-40db-947e-6dfaef432a9c b/docstore/619c4e49-c2ac-40db-947e-6dfaef432a9c new file mode 100644 index 0000000000000000000000000000000000000000..2c07bab3094a44e574749f43c7912d190e99a306 --- /dev/null +++ b/docstore/619c4e49-c2ac-40db-947e-6dfaef432a9c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#imagen-4 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/61a880e2-329d-48c8-9624-f39642f3f55d b/docstore/61a880e2-329d-48c8-9624-f39642f3f55d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/61a880e2-329d-48c8-9624-f39642f3f55d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/61b5fb5c-d95b-4efc-9f5c-a5b516413513 b/docstore/61b5fb5c-d95b-4efc-9f5c-a5b516413513 new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/61b5fb5c-d95b-4efc-9f5c-a5b516413513 @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. 
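Whichever model and implementation approach you choose, a minimal text-only Live API session with the Python SDK looks roughly like the sketch below; the model code is one of the Live-capable preview models listed in the tables earlier in this document, and the prompt is a placeholder.

```python
import asyncio
from google import genai

# Minimal sketch: assumes GEMINI_API_KEY is set in the environment.
client = genai.Client()

MODEL = "gemini-live-2.5-flash-preview"   # Live-capable preview model listed above
CONFIG = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=MODEL, config=CONFIG) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello, how are you today?"}]},
            turn_complete=True,
        )
        # Stream the model's reply until the turn completes.
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

if __name__ == "__main__":
    asyncio.run(main())
```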
Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/61c09423-2971-4344-8aa4-fb6d8254eec0 b/docstore/61c09423-2971-4344-8aa4-fb6d8254eec0 new file mode 100644 index 0000000000000000000000000000000000000000..6b49de6482bf764efc991923a68119bf33fab745 --- /dev/null +++ b/docstore/61c09423-2971-4344-8aa4-fb6d8254eec0 @@ -0,0 +1 @@ +sessions using the token from this request ( newSessionExpireTime ), and 30 minutes to send messages over that connection ( expireTime ). Python import datetime now = datetime . datetime . now ( tz = datetime . timezone . utc ) client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , # The ephemeral token can only be used to start a single session 'expire_time' : now + datetime . timedelta ( minutes = 30 ), # Default is 30 minutes in the future # 'expire_time': '2025-05-17T00:00:00Z', # Accepts isoformat. 'new_session_expire_time' : now + datetime . timedelta ( minutes = 1 ), # Default 1 minute in the future 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token : AuthToken = await client . authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime // Default is 30 mins newSessionExpireTime : new Date ( Date . now () + ( 1 * 60 * 1000 )), // Default 1 minute in the future httpOptions : { apiVersion : 'v1alpha' }, }, }); For expireTime value constraints, defaults, and other field specs, see the API reference . Within the expireTime timeframe, you'll need sessionResumption to reconnect the call every 10 minutes (this can be done with the same token even if uses: 1 ). It's also possible to lock an ephemeral token to a set of configurations. This might be useful to further improve security of your application and keep your system instructions on the server side. Python client = genai . Client ( http_options = { 'api_version' : 'v1alpha' ,} ) token = client . auth_tokens . create ( config = { 'uses' : 1 , 'live_connect_constraints' : { 'model' : 'gemini-2.0-flash-live-001' , 'config' : { 'session_resumption' \ No newline at end of file diff --git a/docstore/61c514e0-abb2-405d-9106-a90fcd8ddc99 b/docstore/61c514e0-abb2-405d-9106-a90fcd8ddc99 new file mode 100644 index 0000000000000000000000000000000000000000..7902f08c6718459a22ed8e1c39d9cd74e8922733 --- /dev/null +++ b/docstore/61c514e0-abb2-405d-9106-a90fcd8ddc99 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/61cc3fd5-ae62-470a-bb5e-d771cfa1521a b/docstore/61cc3fd5-ae62-470a-bb5e-d771cfa1521a new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/61cc3fd5-ae62-470a-bb5e-d771cfa1521a @@ -0,0 +1 @@ +patterns for more details. 
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/61deb4ad-eb09-402e-ba0d-d73febe5089f b/docstore/61deb4ad-eb09-402e-ba0d-d73febe5089f new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/61deb4ad-eb09-402e-ba0d-d73febe5089f @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/61e4575a-57a5-4165-95ad-aeeb61379cd8 b/docstore/61e4575a-57a5-4165-95ad-aeeb61379cd8 new file mode 100644 index 0000000000000000000000000000000000000000..6b1a11d386f4b560f93e6fc6fce6c7f46a05bdf0 --- /dev/null +++ b/docstore/61e4575a-57a5-4165-95ad-aeeb61379cd8 @@ -0,0 +1 @@ +responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Compute the largest prime palindrome under 100000.' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Grounding with Google Search You can enable Grounding with Google Search as part of the session configuration. This increases the Live API's accuracy and prevents hallucinations. See the Grounding tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . 
Client () model = \ No newline at end of file diff --git a/docstore/6200c4ea-22ae-47a8-a70b-677887dc462c b/docstore/6200c4ea-22ae-47a8-a70b-677887dc462c new file mode 100644 index 0000000000000000000000000000000000000000..086087b92745c4e9b643012a145762582d8d8dd5 --- /dev/null +++ b/docstore/6200c4ea-22ae-47a8-a70b-677887dc462c @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "What' \' 's the temperature in London?" } ] } ], "tools": [ { "functionDeclarations": [ { "name": "get_current_temperature", "description": "Gets the current temperature for a given location.", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city name, e.g. San Francisco" } }, "required": ["location"] } } ] } ] }' How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. 
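To make the four steps above concrete, here is a minimal end-to-end sketch of the manual function calling loop using the google-genai Python SDK. It reuses the get_current_temperature declaration from the REST example; the hard-coded temperature value stands in for whatever your real function would return, and the model name is assumed.

```python
from google import genai
from google.genai import types

# Minimal sketch: assumes GEMINI_API_KEY is set in the environment.
client = genai.Client()

# Step 1: define the function declaration (same shape as the REST example above).
weather_function = {
    "name": "get_current_temperature",
    "description": "Gets the current temperature for a given location.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {"type": "string", "description": "The city name, e.g. San Francisco"},
        },
        "required": ["location"],
    },
}
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[weather_function])]
)

# Step 2: send the user prompt plus the declaration to the model.
contents = [types.Content(role="user", parts=[types.Part(text="What's the temperature in London?")])]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

part = response.candidates[0].content.parts[0]
if part.function_call:
    # Step 3: your application executes the function; this value is a stand-in result.
    result = {"temperature": 17, "unit": "Celsius"}

    # Step 4: send the function result back so the model can compose the final answer.
    contents.append(response.candidates[0].content)
    contents.append(types.Content(
        role="user",
        parts=[types.Part.from_function_response(name=part.function_call.name, response=result)],
    ))
    final = client.models.generate_content(
        model="gemini-2.5-flash", contents=contents, config=config
    )
    print(final.text)
else:
    # The model answered directly without requesting a function call.
    print(part.text)
```

In practice the Python SDK can also run this declare-call-respond loop for you via automatic function calling, as described elsewhere in this document.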
This process can be repeated over multiple turns, allowing for complex interactions and \ No newline at end of file diff --git a/docstore/62032ad8-be87-4b12-9ddf-17c4eca93826 b/docstore/62032ad8-be87-4b12-9ddf-17c4eca93826 new file mode 100644 index 0000000000000000000000000000000000000000..baadc60a348efbee6af3b697c8c6a5805e59ae7f --- /dev/null +++ b/docstore/62032ad8-be87-4b12-9ddf-17c4eca93826 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/files#prompt-guide Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/62129947-b001-4b49-ac7a-33652e01b507 b/docstore/62129947-b001-4b49-ac7a-33652e01b507 new file mode 100644 index 0000000000000000000000000000000000000000..bdc0108574722a98196635732e5c499babc817a5 --- /dev/null +++ b/docstore/62129947-b001-4b49-ac7a-33652e01b507 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/files#main-content Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6223a36d-f20e-42bf-8124-2db4478a111c b/docstore/6223a36d-f20e-42bf-8124-2db4478a111c new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/6223a36d-f20e-42bf-8124-2db4478a111c @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. 
Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/6238b817-2824-4174-8106-c60c631496f3 b/docstore/6238b817-2824-4174-8106-c60c631496f3 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/6238b817-2824-4174-8106-c60c631496f3 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/624ae81a-76fa-4191-9ee7-6663ec521cbe b/docstore/624ae81a-76fa-4191-9ee7-6663ec521cbe new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/624ae81a-76fa-4191-9ee7-6663ec521cbe @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/624df49a-36cf-4778-8cc8-a789964b824f b/docstore/624df49a-36cf-4778-8cc8-a789964b824f new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/624df49a-36cf-4778-8cc8-a789964b824f @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/6258256c-5f95-46fc-bc3b-e978be8fda99 b/docstore/6258256c-5f95-46fc-bc3b-e978be8fda99 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/6258256c-5f95-46fc-bc3b-e978be8fda99 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/628490eb-bb69-41cc-9908-c5ea12d16e3d b/docstore/628490eb-bb69-41cc-9908-c5ea12d16e3d new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/628490eb-bb69-41cc-9908-c5ea12d16e3d @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/62d1e642-104d-4425-bb2f-bb6fdfdf11ed b/docstore/62d1e642-104d-4425-bb2f-bb6fdfdf11ed new file mode 100644 index 0000000000000000000000000000000000000000..a5a6b39c097d8cbbd04646d1c0a8361a10d2c9ae --- /dev/null +++ b/docstore/62d1e642-104d-4425-bb2f-bb6fdfdf11ed @@ -0,0 +1 @@ +meanings as statements, which means that a RAG system won't automatically recognize their relation. Task types enable you to generate optimized embeddings for specific tasks, saving you time and cost and improving performance. Python from google import genai from google.genai import types client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" , config = types . EmbedContentConfig ( task_type = "SEMANTIC_SIMILARITY" ) ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , config : { taskType : "SEMANTIC_SIMILARITY" , } }); console . log ( response . embeddings ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]}, "taskType": "SEMANTIC_SIMILARITY" }' Supported task types Task type Description SEMANTIC_SIMILARITY Used to generate embeddings that are optimized to assess text similarity. CLASSIFICATION Used to generate embeddings that are optimized to classify texts according to preset labels. CLUSTERING Used to generate embeddings that are optimized to cluster texts based on their similarities. 
RETRIEVAL_DOCUMENT , RETRIEVAL_QUERY , QUESTION_ANSWERING , and FACT_VERIFICATION Used to generate embeddings that are optimized for document search or information retrieval. CODE_RETRIEVAL_QUERY Used to retrieve a code block based on a natural language query, such as sort an array or reverse a linked list. Embeddings of the code blocks are computed using RETRIEVAL_DOCUMENT . Use cases Text embeddings \ No newline at end of file diff --git a/docstore/62d452da-d809-459c-a2dd-70472b58bb9b b/docstore/62d452da-d809-459c-a2dd-70472b58bb9b new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/62d452da-d809-459c-a2dd-70472b58bb9b @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/62f77c39-efbd-441f-8fbf-85265771fa1a b/docstore/62f77c39-efbd-441f-8fbf-85265771fa1a new file mode 100644 index 0000000000000000000000000000000000000000..84fe47fad9b0a25663e2cf29b8b19192b98d3175 --- /dev/null +++ b/docstore/62f77c39-efbd-441f-8fbf-85265771fa1a @@ -0,0 +1 @@ +Available regions for Google AI Studio and Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
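To make the task types above concrete, here is a small sketch that embeds two sentences with SEMANTIC_SIMILARITY and compares them with cosine similarity, assuming the google-genai SDK and the gemini-embedding-exp-03-07 model from the earlier example; the .values field on each returned embedding and the similarity math are the only parts not shown on the page:

# Sketch: score the similarity of two sentences with SEMANTIC_SIMILARITY embeddings.
# Assumes the google-genai SDK and GEMINI_API_KEY are available.
import math
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents=["What is the meaning of life?", "What is the purpose of existence?"],
    config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
)

# One embedding is returned per input string; each exposes a list of floats.
a, b = [e.values for e in result.embeddings]
dot = sum(x * y for x, y in zip(a, b))
cos = dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))
print(f"cosine similarity: {cos:.3f}")

Embeddings tuned for SEMANTIC_SIMILARITY should score paraphrases like these higher than unrelated sentences, whereas a RETRIEVAL_QUERY / RETRIEVAL_DOCUMENT pair is the better choice for search-style lookups.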
Learn more Home Gemini API Models Send feedback Available regions for Google AI Studio and Gemini API If you reached this page after trying to open Google AI Studio , it may be because Google AI Studio is not available in your region, or you don't meet the age requirements (18+) for access. You can learn more about the available regions in the following section and other requirements in the terms of service . Available regions Note: For Colab users - Region restrictions are applied based on the region that the Colab instance is in, not the region that the user is in. You can check the location of the Colab instance using !curl ipinfo.io The Gemini API and Google AI Studio are available in the following countries and territories. If you're not in one of these countries or territories, try the Gemini API in Vertex AI : Albania Algeria American Samoa Angola Anguilla Antarctica Antigua and Barbuda Argentina Armenia Aruba Australia Austria Azerbaijan The Bahamas Bahrain Bangladesh Barbados Belgium Belize Benin Bermuda Bhutan Bolivia Bosnia Botswana Brazil British Indian Ocean Territory British Virgin Islands Brunei Bulgaria Burkina Faso Burundi Cabo Verde Cambodia Cameroon Canada Caribbean Netherlands Cayman Islands Central African Republic Chad Chile Christmas Island Cocos (Keeling) Islands Colombia Comoros Cook Islands Côte d'Ivoire Costa Rica Croatia Curaçao Czech Republic Democratic Republic of the Congo Denmark Djibouti Dominica Dominican Republic Ecuador Egypt El Salvador Equatorial Guinea Eritrea Estonia Eswatini Ethiopia Falkland Islands (Islas Malvinas) Faroe Islands Fiji Finland France \ No newline at end of file diff --git a/docstore/633b7503-f4c7-4c4c-9950-f37a683a3a7e b/docstore/633b7503-f4c7-4c4c-9950-f37a683a3a7e new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/633b7503-f4c7-4c4c-9950-f37a683a3a7e @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/633c884a-5b30-46de-aa6e-bed51bec3d07 b/docstore/633c884a-5b30-46de-aa6e-bed51bec3d07 new file mode 100644 index 0000000000000000000000000000000000000000..10818600c4a983fe2afdf5abfcdbca758413b32e --- /dev/null +++ b/docstore/633c884a-5b30-46de-aa6e-bed51bec3d07 @@ -0,0 +1 @@ +the files the user uploads and pay to store them on a per hour basis. The input / output cost per request with Gemini Flash for example is ~4x less than the standard input / output cost, so if the user chats with their data enough, it becomes a huge cost saving for you as the developer. Long context limitations In various sections of this guide, we talked about how Gemini models achieve high performance across various needle-in-a-haystack retrieval evals. These tests consider the most basic setup, where you have a single needle you are looking for. In cases where you might have multiple "needles" or specific pieces of information you are looking for, the model does not perform with the same accuracy. Performance can vary to a wide degree depending on the context. This is important to consider as there is an inherent tradeoff between getting the right information retrieved and cost. You can get ~99% on a single query, but you have to pay the input token cost every time you send that query. So for 100 pieces of information to be retrieved, if you needed 99% performance, you would likely need to send 100 requests. This is a good example of where context caching can significantly reduce the cost associated with using Gemini models while keeping the performance high. FAQs Where is the best place to put my query in the context window? In most cases, especially if the total context is long, the model's performance will be better if you put your query / question at the end of the prompt (after all the other context). Do I lose model performance when I add more tokens to a query? Generally, if you don't need tokens to be passed to the model, it is best to avoid passing them. However, if you have a large chunk of tokens with some information and want to ask questions about that information, the model is highly capable of extracting that information (up to 99% accuracy in many cases). How can I lower my cost with long-context queries? If you have a similar set of tokens / \ No newline at end of file diff --git a/docstore/633f3155-530a-492e-84fe-93e5c068d401 b/docstore/633f3155-530a-492e-84fe-93e5c068d401 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/633f3155-530a-492e-84fe-93e5c068d401 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/634b1e44-249c-468a-83a4-f09751c3ba96 b/docstore/634b1e44-249c-468a-83a4-f09751c3ba96 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/634b1e44-249c-468a-83a4-f09751c3ba96 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . 
send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/634b8209-225f-49c2-a60b-63ac29e586d7 b/docstore/634b8209-225f-49c2-a60b-63ac29e586d7 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/634b8209-225f-49c2-a60b-63ac29e586d7 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/635ce5fd-49b5-44c9-9aa9-f2187edc37b0 b/docstore/635ce5fd-49b5-44c9-9aa9-f2187edc37b0 new file mode 100644 index 0000000000000000000000000000000000000000..df0e1f9fb0a005441553bb4be8975eaf5201dec5 --- /dev/null +++ b/docstore/635ce5fd-49b5-44c9-9aa9-f2187edc37b0 @@ -0,0 +1 @@ +Document understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Document understanding Gemini models can process documents in PDF format, using native vision to understand entire document contexts. This goes beyond simple text extraction, allowing Gemini to: Analyze and interpret content, including text, images, diagrams, charts, and tables, even in long documents up to 1000 pages. Extract information into structured output formats. Summarize and answer questions based on both the visual and textual elements in a document. Transcribe document content (e.g. to HTML), preserving layouts and formatting, for use in downstream applications. Passing inline PDF data You can pass inline PDF data in the request to generateContent . For PDF payloads under 20MB, you can choose between uploading base64 encoded documents or directly uploading locally stored files. The following example shows you how to fetch a PDF from a URL and convert it to bytes for processing: Python from google import genai from google.genai import types import httpx client = genai . Client () doc_url = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" # Retrieve and encode the PDF byte doc_data = httpx . get ( doc_url ) . content prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = doc_data , mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfResp = await fetch \ No newline at end of file diff --git a/docstore/63667cb4-d283-40ea-a6fe-1ed8f4419478 b/docstore/63667cb4-d283-40ea-a6fe-1ed8f4419478 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/63667cb4-d283-40ea-a6fe-1ed8f4419478 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/6366c34d-a55b-4bea-93b9-8a7ff7f2c9cb b/docstore/6366c34d-a55b-4bea-93b9-8a7ff7f2c9cb new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/6366c34d-a55b-4bea-93b9-8a7ff7f2c9cb @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/639e82ac-345a-45c6-ad07-1df11f13d24c b/docstore/639e82ac-345a-45c6-ad07-1df11f13d24c new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/639e82ac-345a-45c6-ad07-1df11f13d24c @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . 
readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/63a907db-b543-4d93-b019-227b61d516a8 b/docstore/63a907db-b543-4d93-b019-227b61d516a8 new file mode 100644 index 0000000000000000000000000000000000000000..8a34a1fe66a041005f53a5e081e09b0fa5f13242 --- /dev/null +++ b/docstore/63a907db-b543-4d93-b019-227b61d516a8 @@ -0,0 +1 @@ +Grounding with Google Search | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Grounding with Google Search Grounding with Google Search connects the Gemini model to real-time web content and works with all available languages . This allows Gemini to provide more accurate answers and cite verifiable sources beyond its knowledge cutoff. Grounding helps you build applications that can: Increase factual accuracy: Reduce model hallucinations by basing responses on real-world information. Access real-time information: Answer questions about recent events and topics. Provide citations: Build user trust by showing the sources for the model's claims. Python from google import genai from google.genai import types # Configure the client client = genai . Client () # Define the grounding tool grounding_tool = types . Tool ( google_search = types . GoogleSearch () ) # Configure generation settings config = types . 
GenerateContentConfig ( tools = [ grounding_tool ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Who won the euro 2024?" , config = config , ) # Print the grounded response print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Define the grounding tool const groundingTool = { googleSearch : {}, }; // Configure generation settings const config = { tools : [ groundingTool ], }; // Make the request const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Who won the euro 2024?" , config , }); // Print the grounded response console . log ( response . \ No newline at end of file diff --git a/docstore/63af7a36-7410-4c35-ac69-a7bdbbda811b b/docstore/63af7a36-7410-4c35-ac69-a7bdbbda811b new file mode 100644 index 0000000000000000000000000000000000000000..ebc9c731f8f83925d47212f648cd7b6ffe1e65ff --- /dev/null +++ b/docstore/63af7a36-7410-4c35-ac69-a7bdbbda811b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-pro-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/63b16469-6fe2-4ca0-adb6-6ad3eaed9c13 b/docstore/63b16469-6fe2-4ca0-adb6-6ad3eaed9c13 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/63b16469-6fe2-4ca0-adb6-6ad3eaed9c13 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/63cf3ecf-dfa4-4bec-b2bb-46286e622351 b/docstore/63cf3ecf-dfa4-4bec-b2bb-46286e622351 new file mode 100644 index 0000000000000000000000000000000000000000..82539837fcb7adc353717f66580809eb160e30f0 --- /dev/null +++ b/docstore/63cf3ecf-dfa4-4bec-b2bb-46286e622351 @@ -0,0 +1 @@ +Google stock price?" , tools = 'google_search_retrieval' ) After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the Google stock price?' , config = types . GenerateContentConfig ( tools = [ types . Tool ( google_search = types . GoogleSearch () ) ] ) ) JSON response Generate answers in JSON format. Before Python By specifying a response_schema and setting response_mime_type="application/json" users can constrain the model to produce a JSON response following a given structure. import google.generativeai as genai import typing_extensions as typing class CountryInfo ( typing . TypedDict ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" ) result = model . generate_content ( "Give me information of the United States" , generation_config = genai . GenerationConfig ( response_mime_type = "application/json" , response_schema = CountryInfo ), ) JavaScript import { GoogleGenerativeAI , SchemaType } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const schema = { description : "List of recipes" , type : SchemaType . ARRAY , items : { type : SchemaType . OBJECT , properties : { recipeName : { type : SchemaType . STRING , description : "Name of the recipe" , nullable : false , }, }, required : [ "recipeName" ], }, }; const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" , generationConfig : { responseMimeType : "application/json" , responseSchema : schema , }, }); const result = await model . generateContent ( "List a few popular cookie recipes." , ); console . log ( result . response . text ()); After Python The new SDK uses pydantic classes to provide the schema (although you can pass a genai.types.Schema , or equivalent \ No newline at end of file diff --git a/docstore/63e1c6c3-199e-4295-aa62-632d38ba6227 b/docstore/63e1c6c3-199e-4295-aa62-632d38ba6227 new file mode 100644 index 0000000000000000000000000000000000000000..568ed74dcb0eadd29fa79830f3f9f68ba7bde9f4 --- /dev/null +++ b/docstore/63e1c6c3-199e-4295-aa62-632d38ba6227 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/grounding Title: Grounding with Google Search | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/63e2919e-a715-4f75-8364-4403eb390a4c b/docstore/63e2919e-a715-4f75-8364-4403eb390a4c new file mode 100644 index 0000000000000000000000000000000000000000..9b7a72ccaa4be8bd1d0ee10d4c849d278d1b90e5 --- /dev/null +++ b/docstore/63e2919e-a715-4f75-8364-4403eb390a4c @@ -0,0 +1 @@ +the standard rate limits for GenerateContent apply, and token limits include cached tokens. 
The number of cached tokens is returned in the usage_metadata from the create, get, and list operations of the cache service, and also in GenerateContent when using the cache. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/6417eab5-9340-4ddc-a292-cf99150c38c9 b/docstore/6417eab5-9340-4ddc-a292-cf99150c38c9 new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/6417eab5-9340-4ddc-a292-cf99150c38c9 @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/641a814a-747e-444d-b184-cdd177fec7c3 b/docstore/641a814a-747e-444d-b184-cdd177fec7c3 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/641a814a-747e-444d-b184-cdd177fec7c3 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/6434eddf-8c1e-4fa1-a76a-8752be97c4e3 b/docstore/6434eddf-8c1e-4fa1-a76a-8752be97c4e3 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/6434eddf-8c1e-4fa1-a76a-8752be97c4e3 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. 
The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/644c5176-a049-4c2d-9ee5-546d0ab587cf b/docstore/644c5176-a049-4c2d-9ee5-546d0ab587cf new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/644c5176-a049-4c2d-9ee5-546d0ab587cf @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/64517781-baf5-40ec-abf5-1564ea467526 b/docstore/64517781-baf5-40ec-abf5-1564ea467526 new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/64517781-baf5-40ec-abf5-1564ea467526 @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. 
- large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/645c9fd5-6b43-4bbc-aa57-70bffcdf6e32 b/docstore/645c9fd5-6b43-4bbc-aa57-70bffcdf6e32 new file mode 100644 index 0000000000000000000000000000000000000000..1426f6277d87da029e324e49b5a4fcb88dde544c --- /dev/null +++ b/docstore/645c9fd5-6b43-4bbc-aa57-70bffcdf6e32 @@ -0,0 +1 @@ +live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . 
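The "Experiment with model parameters" guidance above is cut off mid-list; as a rough sketch of how those knobs are set with the google-genai SDK (the specific values are illustrative, not recommendations):

# Sketch: setting the common generation parameters named above on a single request.
# Assumes the google-genai SDK and GEMINI_API_KEY in the environment.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Classify the text as large or small: Elephant",
    config=types.GenerateContentConfig(
        max_output_tokens=256,      # cap on tokens the model may generate
        temperature=0.2,            # lower values make output more deterministic
        top_p=0.95,                 # nucleus sampling cutoff
        stop_sequences=["\n\n"],    # stop generation early at a blank line
    ),
)
print(response.text)

Re-running the same prompt with different values for these fields is the quickest way to see how each parameter changes the response for your task.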
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/6463095d-875e-4204-a2ce-049be9646357 b/docstore/6463095d-875e-4204-a2ce-049be9646357 new file mode 100644 index 0000000000000000000000000000000000000000..713dbbbbe2503b08013f57d8bc685db657b79963 --- /dev/null +++ b/docstore/6463095d-875e-4204-a2ce-049be9646357 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs#main-content Title: Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/647363c1-6ba9-4c14-a442-736c44f3c99c b/docstore/647363c1-6ba9-4c14-a442-736c44f3c99c new file mode 100644 index 0000000000000000000000000000000000000000..019b7de7e49d445c43758810d78952e4f88cd47b --- /dev/null +++ b/docstore/647363c1-6ba9-4c14-a442-736c44f3c99c @@ -0,0 +1 @@ +prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . 
\ No newline at end of file diff --git a/docstore/6482e983-8d66-48f3-a052-3fe8ffbc7209 b/docstore/6482e983-8d66-48f3-a052-3fe8ffbc7209 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/6482e983-8d66-48f3-a052-3fe8ffbc7209 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/64a43ad6-fcda-4a00-8062-78306783099b b/docstore/64a43ad6-fcda-4a00-8062-78306783099b new file mode 100644 index 0000000000000000000000000000000000000000..bcef42c3270c69579ba66cffffe2395612618a55 --- /dev/null +++ b/docstore/64a43ad6-fcda-4a00-8062-78306783099b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#affective-dialog Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/64bad589-f8df-45d4-bb1c-5cddb408bd34 b/docstore/64bad589-f8df-45d4-bb1c-5cddb408bd34 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/64bad589-f8df-45d4-bb1c-5cddb408bd34 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . 
Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/64c93946-344e-425f-89a2-09a1952dbcfd b/docstore/64c93946-344e-425f-89a2-09a1952dbcfd new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/64c93946-344e-425f-89a2-09a1952dbcfd @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/64ca695e-f9dd-4f85-9505-f844f0efcc6e b/docstore/64ca695e-f9dd-4f85-9505-f844f0efcc6e new file mode 100644 index 0000000000000000000000000000000000000000..d73a03ac64bf52901f07bf0a8fe4fc21e47f6048 --- /dev/null +++ b/docstore/64ca695e-f9dd-4f85-9505-f844f0efcc6e @@ -0,0 +1 @@ +are used in a variety of common AI use cases, such as: Information retrieval: You can use embeddings to retrieve semantically similar text given a piece of input text. Document search tutorial task Clustering: Comparing groups of embeddings can help identify hidden trends. Embedding clustering tutorial bubble_chart Vector database: As you take different embedding use cases to production, it is common to store embeddings in a vector database. Vector database tutorial bolt Classification: You can train a model using embeddings to classify documents into categories. Classification tutorial token Embedding models The Gemini API offers three models that generate text embeddings: gemini-embedding-exp-03-07 text-embedding-004 embedding-001 What's next Check out the embeddings quickstart notebook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/64ccff89-8f53-478d-8096-c6ceb01299fe b/docstore/64ccff89-8f53-478d-8096-c6ceb01299fe new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/64ccff89-8f53-478d-8096-c6ceb01299fe @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/64e81080-6b5e-4f50-bd74-42678915489a b/docstore/64e81080-6b5e-4f50-bd74-42678915489a new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/64e81080-6b5e-4f50-bd74-42678915489a @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. 
For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/650883ee-cabf-46b7-9a3d-40564fc1a18f b/docstore/650883ee-cabf-46b7-9a3d-40564fc1a18f new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/650883ee-cabf-46b7-9a3d-40564fc1a18f @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/6522c605-a16d-4513-883c-41acf5f45014 b/docstore/6522c605-a16d-4513-883c-41acf5f45014 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/6522c605-a16d-4513-883c-41acf5f45014 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
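As a quick illustration of the long-context use described above, here is a minimal sketch, assuming the google-genai Python SDK and a GEMINI_API_KEY in the environment; the input file name and the question are placeholders rather than anything from the original page. Python
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment

# Hypothetical large input; gemini-2.5-pro accepts on the order of a million input tokens.
with open("large_codebase_dump.txt") as f:
    big_context = f.read()

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=[big_context, "Summarize the main modules and how they interact."],
)
print(response.text)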
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/652b6bce-2e88-4e56-bb1b-0fa40e52c8bd b/docstore/652b6bce-2e88-4e56-bb1b-0fa40e52c8bd new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/652b6bce-2e88-4e56-bb1b-0fa40e52c8bd @@ -0,0 +1 @@ +voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . 
end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/65372eff-29c3-4828-b558-e583a921aa78 b/docstore/65372eff-29c3-4828-b558-e583a921aa78 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/65372eff-29c3-4828-b558-e583a921aa78 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/6539c674-6e0d-48df-8acb-461405731647 b/docstore/6539c674-6e0d-48df-8acb-461405731647 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/6539c674-6e0d-48df-8acb-461405731647 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. 
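To make that round trip concrete, here is a minimal sketch, assuming the google-genai Python SDK; the getWeather declaration and the hard-coded result are placeholders. The key point is that the model's returned content, whose parts carry the thought signature, is appended to the conversation unchanged before the function response is added. Python
from google import genai
from google.genai import types

client = genai.Client()

# Placeholder declaration standing in for the Get Weather example.
weather_tool = types.Tool(function_declarations=[types.FunctionDeclaration(
    name="getWeather",
    description="Gets the weather for a city.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"city": types.Schema(type=types.Type.STRING)},
        required=["city"],
    ),
)])
config = types.GenerateContentConfig(tools=[weather_tool])

history = [types.Content(role="user", parts=[types.Part(text="What's the weather in Lake Tahoe?")])]
response = client.models.generate_content(model="gemini-2.5-flash", contents=history, config=config)

model_content = response.candidates[0].content   # its parts keep their thought_signature
history.append(model_content)                    # return the signature to the server untouched

fc = next(p.function_call for p in model_content.parts if p.function_call)
result = {"forecast": "sunny, 18 C"}             # placeholder for a real getWeather call
history.append(types.Content(
    role="user",
    parts=[types.Part.from_function_response(name=fc.name, response=result)],
))

final = client.models.generate_content(model="gemini-2.5-flash", contents=history, config=config)
print(final.text)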
Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/6542f12e-2924-416a-b291-3aee6d567d8c b/docstore/6542f12e-2924-416a-b291-3aee6d567d8c new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/6542f12e-2924-416a-b291-3aee6d567d8c @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. 
The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/654b91ec-056f-458a-9841-1b065ec9cf45 b/docstore/654b91ec-056f-458a-9841-1b065ec9cf45 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/654b91ec-056f-458a-9841-1b065ec9cf45 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/65613d11-ffb0-4b33-ac7b-a0c03fd4b54c b/docstore/65613d11-ffb0-4b33-ac7b-a0c03fd4b54c new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/65613d11-ffb0-4b33-ac7b-a0c03fd4b54c @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. 
A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/6582449b-dcb7-45ac-87cb-955435190fa8 b/docstore/6582449b-dcb7-45ac-87cb-955435190fa8 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/6582449b-dcb7-45ac-87cb-955435190fa8 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/6588f26a-7dcc-4dbb-8673-6bee75553b2a b/docstore/6588f26a-7dcc-4dbb-8673-6bee75553b2a new file mode 100644 index 0000000000000000000000000000000000000000..a76efec9a9a3e7390e77e9a866cc227646391c5b --- /dev/null +++ b/docstore/6588f26a-7dcc-4dbb-8673-6bee75553b2a @@ -0,0 +1 @@ +Billing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Billing This guide provides an overview of different Gemini API billing options, explains how to enable billing and monitor usage, and provides answers to frequently asked questions (FAQs) about billing. Upgrade to the Gemini API paid tier About billing Billing for the Gemini API is based on two pricing tiers: free of charge (or free ) and pay-as-you-go (or paid ). Pricing and rate limits differ between these tiers and also vary by model. You can check out the rate limits and pricing pages for more into. For a model-by-model breakdown of capabilities, see the Gemini models page . How to request an upgrade To transition from the free tier to the pay-as-you-go plan, you need to enable billing for your Google Cloud project. The button you see in Google AI Studio depends on your project's current plan. If you're on the free tier, you'll see a Set up Billing button for your project. If you're already on the paid tier and meet the criteria for a plan change, you might see an Upgrade button. To start the process, follow these steps: Go to the AI Studio API keys page . Find the project you want to move to the paid plan and click either Set up Billing or Upgrade , depending on the button displayed. The next step depends on the button you clicked: If you clicked Set up Billing: You'll be redirected to the Google Cloud console to link a billing account to your project. Follow the on-screen instructions to complete the process. If you clicked Upgrade: The system will automatically verify your project's eligibility. 
If your project meets all the requirements, it will be instantly upgraded to \ No newline at end of file diff --git a/docstore/65967db6-78fe-4638-a266-d888d831c0b5 b/docstore/65967db6-78fe-4638-a266-d888d831c0b5 new file mode 100644 index 0000000000000000000000000000000000000000..b73659061f0ce2830a1e6cf67f6a74b5cc699bc6 --- /dev/null +++ b/docstore/65967db6-78fe-4638-a266-d888d831c0b5 @@ -0,0 +1 @@ +"turn_off_the_lights" } tools = [{ "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Turn on the lights please" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) elif chunk . tool_call : function_responses = [] for fc in chunk . tool_call . function_calls : function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" } # simple, hard-coded function response ) function_responses . append ( function_response ) await session . send_tool_response ( function_responses = function_responses ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; // Simple function definitions const turn_on_the_lights = { name : "turn_on_the_lights" } // , description: '...', parameters: { ... } const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { \ No newline at end of file diff --git a/docstore/659e6977-da7d-4c02-b86c-680119b39234 b/docstore/659e6977-da7d-4c02-b86c-680119b39234 new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/659e6977-da7d-4c02-b86c-680119b39234 @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . 
log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/65b1c6b5-a31b-4c4d-aeb8-3b0b2ccc1c4b b/docstore/65b1c6b5-a31b-4c4d-aeb8-3b0b2ccc1c4b new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/65b1c6b5-a31b-4c4d-aeb8-3b0b2ccc1c4b @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . 
\ No newline at end of file diff --git a/docstore/65c497ec-0aa9-4226-b27a-6a26c22df61a b/docstore/65c497ec-0aa9-4226-b27a-6a26c22df61a new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/65c497ec-0aa9-4226-b27a-6a26c22df61a @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/65cc2597-4b37-4b2e-beef-8864829fc4fa b/docstore/65cc2597-4b37-4b2e-beef-8864829fc4fa new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/65cc2597-4b37-4b2e-beef-8864829fc4fa @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/65cf028d-fb7e-4a85-b0ca-da860ac6e165 b/docstore/65cf028d-fb7e-4a85-b0ca-da860ac6e165 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/65cf028d-fb7e-4a85-b0ca-da860ac6e165 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/65ee61ac-7b49-423b-945b-c2cef34f3d46 b/docstore/65ee61ac-7b49-423b-945b-c2cef34f3d46 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/65ee61ac-7b49-423b-945b-c2cef34f3d46 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/6605e154-3e73-464d-8383-e0ecbbaf87e4 b/docstore/6605e154-3e73-464d-8383-e0ecbbaf87e4 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/6605e154-3e73-464d-8383-e0ecbbaf87e4 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/660bc72f-fea5-44a4-97d2-e228ff218142 b/docstore/660bc72f-fea5-44a4-97d2-e228ff218142 new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/660bc72f-fea5-44a4-97d2-e228ff218142 @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. 
For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/66147146-c31b-40a4-afea-986bfff14d64 b/docstore/66147146-c31b-40a4-afea-986bfff14d64 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/66147146-c31b-40a4-afea-986bfff14d64 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/661555f0-f826-4e9d-86f5-933e7c97d953 b/docstore/661555f0-f826-4e9d-86f5-933e7c97d953 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/661555f0-f826-4e9d-86f5-933e7c97d953 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. 
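To show the embedding models above in use, here is a minimal sketch, assuming the google-genai Python SDK; the two input strings are placeholders. It requests embeddings from text-embedding-004 and scores their relatedness with cosine similarity. Python
import math
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How do I bake bread?", "Instructions for baking bread"],
)
vec_a, vec_b = (e.values for e in result.embeddings)

# Cosine similarity as a rough relatedness score between the two strings.
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = math.sqrt(sum(a * a for a in vec_a)) * math.sqrt(sum(b * b for b in vec_b))
print(f"cosine similarity: {dot / norm:.3f}")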
We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/664c9339-8ed4-49c7-9231-b663340f8fd1 b/docstore/664c9339-8ed4-49c7-9231-b663340f8fd1 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/664c9339-8ed4-49c7-9231-b663340f8fd1 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . 
Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/665034cd-6b27-468f-b030-f79718fa0ee6 b/docstore/665034cd-6b27-468f-b030-f79718fa0ee6 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/665034cd-6b27-468f-b030-f79718fa0ee6 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/66643608-617a-410d-8875-12da53cfc4cc b/docstore/66643608-617a-410d-8875-12da53cfc4cc new file mode 100644 index 0000000000000000000000000000000000000000..a3a5b12622da56afcf66e8d09d0c1c7555dea0d4 --- /dev/null +++ b/docstore/66643608-617a-410d-8875-12da53cfc4cc @@ -0,0 +1 @@ +"gemini-live-2.5-flash-preview" tools = [{ 'google_search' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "When did the last Brazil vs. 
Argentina soccer match happen?" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) # The model might generate and execute Python code to use Search model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ googleSearch : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/666779c5-1fcd-4e81-bda3-692cb5cd89fc b/docstore/666779c5-1fcd-4e81-bda3-692cb5cd89fc new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/666779c5-1fcd-4e81-bda3-692cb5cd89fc @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. 
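As a companion to the embedding model entries above, here is a hedged sketch of requesting embeddings with the google-genai SDK; the embed_content call and the response shape follow the embeddings guide, so treat them as assumptions if your SDK version differs.

```python
# Hedged sketch of generating text embeddings with the google-genai SDK.
# The model name matches the text-embedding-004 entry above; the response
# shape (result.embeddings[i].values) is an assumption from the embeddings guide.
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)

# One embedding per input; each is a list of floats (768 dimensions for this model).
for embedding in result.embeddings:
    print(len(embedding.values))
```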
Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/66ac94a2-a06e-4770-b49f-2428c6610b95 b/docstore/66ac94a2-a06e-4770-b49f-2428c6610b95 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/66ac94a2-a06e-4770-b49f-2428c6610b95 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/66b494b0-c950-43b7-a9e9-74e81ab8bb0f b/docstore/66b494b0-c950-43b7-a9e9-74e81ab8bb0f new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/66b494b0-c950-43b7-a9e9-74e81ab8bb0f @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . 
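Before moving on, a short sketch of the thinking-budget adjustment mentioned above. It reuses the thinking_config pattern that appears later in this section (where the budget is set to 0 for object detection); the budget value here is an illustrative assumption.

```python
# Sketch of adjusting the thinking budget on a thinking-capable model.
# Mirrors the thinking_config usage shown elsewhere on these pages; the
# budget value of 1024 is an illustrative assumption.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    config=types.GenerateContentConfig(
        # 0 disables thinking; larger values allow more internal reasoning tokens.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)
```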
Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/66b700e0-deed-4cce-bc07-6143adfa4059 b/docstore/66b700e0-deed-4cce-bc07-6143adfa4059 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/66b700e0-deed-4cce-bc07-6143adfa4059 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/66cea390-936a-4413-9553-1fa0ebdae2f3 b/docstore/66cea390-936a-4413-9553-1fa0ebdae2f3 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/66cea390-936a-4413-9553-1fa0ebdae2f3 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/66d30ea7-950b-451b-b67f-668d08d30ca8 b/docstore/66d30ea7-950b-451b-b67f-668d08d30ca8 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/66d30ea7-950b-451b-b67f-668d08d30ca8 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. 
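The walkthrough above ends at Step 4 (returning the function result to the model). For completeness, here is a hedged sketch of the earlier steps: declaring the function, passing it as a tool, and reading the function call the model returns. The set_light_values name comes from the walkthrough; its parameter schema and the prompt are assumptions made for illustration.

```python
# Hedged sketch of the earlier steps of the function calling flow above:
# declare a function, pass it as a tool, and read the model's function call.
# The set_light_values name comes from the walkthrough; its parameter schema
# and the prompt are illustrative assumptions.
from google import genai
from google.genai import types

client = genai.Client()

set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {"type": "number", "description": "0 to 100"},
            "color_temp": {"type": "string", "description": "e.g. warm, cool"},
        },
        "required": ["brightness", "color_temp"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[set_light_values_declaration])]
)

contents = [
    types.Content(role="user", parts=[types.Part(text="Turn the lights down to a warm 30%.")])
]

response = client.models.generate_content(
    model="gemini-2.5-flash", config=config, contents=contents
)

# The model does not run the function; it returns a structured call for your code to execute.
tool_call = response.candidates[0].content.parts[0].function_call
print(tool_call.name, tool_call.args)
```

From here, executing the function and appending a function response part (as shown in the Step 4 code above) closes the loop.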
Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/66d6bd32-9b97-4bbc-855f-cc6bb789cc0b b/docstore/66d6bd32-9b97-4bbc-855f-cc6bb789cc0b new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/66d6bd32-9b97-4bbc-855f-cc6bb789cc0b @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/66db9702-1aaf-4ea5-8ac3-19efdd09423e b/docstore/66db9702-1aaf-4ea5-8ac3-19efdd09423e new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/66db9702-1aaf-4ea5-8ac3-19efdd09423e @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/66e37706-85df-40bc-b1c3-62306eca6812 b/docstore/66e37706-85df-40bc-b1c3-62306eca6812 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/66e37706-85df-40bc-b1c3-62306eca6812 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. 
That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/66e565d4-80ea-41da-b815-539bc843b587 b/docstore/66e565d4-80ea-41da-b815-539bc843b587 new file mode 100644 index 0000000000000000000000000000000000000000..853decdfe946c476f8e5126e214d1b8cf3bf0fa7 --- /dev/null +++ b/docstore/66e565d4-80ea-41da-b815-539bc843b587 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/embeddings Title: Embeddings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/66e86545-1630-4e77-8da1-d48fe1783b62 b/docstore/66e86545-1630-4e77-8da1-d48fe1783b62 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/66e86545-1630-4e77-8da1-d48fe1783b62 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
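The Imagen 3 entry above lists imagen-3.0-generate-002 as a text-to-image model; a hedged sketch of calling it with the google-genai SDK follows. The generate_images call and the generated_images response shape come from the image generation guide rather than this page, so treat them as assumptions.

```python
# Hedged sketch of calling the Imagen 3 model listed above via the
# google-genai SDK. generate_images / GenerateImagesConfig and the
# generated_images response shape are assumptions from the image generation guide.
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A photorealistic snow leopard walking through a winter forest",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Each generated image carries raw bytes that can be written to disk.
with open("snow_leopard.png", "wb") as f:
    f.write(result.generated_images[0].image.image_bytes)
```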
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/66eeeb32-6c85-4f0d-816a-e5bd98684c50 b/docstore/66eeeb32-6c85-4f0d-816a-e5bd98684c50 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/66eeeb32-6c85-4f0d-816a-e5bd98684c50 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/66f22f0a-476e-4033-a9ba-2a16cd223773 b/docstore/66f22f0a-476e-4033-a9ba-2a16cd223773 new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/66f22f0a-476e-4033-a9ba-2a16cd223773 @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. 
The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/66f6e8d1-6eb6-49ab-8a59-e894dd2e171e b/docstore/66f6e8d1-6eb6-49ab-8a59-e894dd2e171e new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/66f6e8d1-6eb6-49ab-8a59-e894dd2e171e @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . 
append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/670479a2-c084-403f-a965-ec740e1b4440 b/docstore/670479a2-c084-403f-a965-ec740e1b4440 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/670479a2-c084-403f-a965-ec740e1b4440 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/671a1162-e19d-4618-9342-66736bb7d4b3 b/docstore/671a1162-e19d-4618-9342-66736bb7d4b3 new file mode 100644 index 0000000000000000000000000000000000000000..6a8a77c3ec0b2f12317f225d20ed3ea5b03e9f67 --- /dev/null +++ b/docstore/671a1162-e19d-4618-9342-66736bb7d4b3 @@ -0,0 +1 @@ +JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Write a story about a magic backpack." ; const result = await model . generateContentStream ( prompt ); // Print text as it comes in. for await ( const chunk of result . stream ) { const chunkText = chunk . text (); process . stdout . write ( chunkText ); } Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . 
GenerativeModel ( "gemini-1.5-flash" ) iter := model . GenerateContentStream ( ctx , genai . Text ( "Write a story about a magic backpack." )) for { resp , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing the response } After Python from google import genai client = genai . Client () for chunk in client . models . generate_content_stream ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ): print ( chunk . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContentStream ({ model : "gemini-2.0-flash" , contents : "Write a story about a magic backpack." , }); let text = "" ; for await ( const chunk of response ) { console . log ( chunk . text ); text += chunk . text ; } Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } for result , err := range client . Models . GenerateContentStream ( ctx , "gemini-2.0-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) { if err != nil { log . Fatal ( err ) } fmt . Print ( result . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } Configuration \ No newline at end of file diff --git a/docstore/672062ee-ef15-4d3d-8319-b1eb174acf07 b/docstore/672062ee-ef15-4d3d-8319-b1eb174acf07 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/672062ee-ef15-4d3d-8319-b1eb174acf07 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
\ No newline at end of file diff --git a/docstore/6737de94-7d83-47d3-87b3-fb390c460f10 b/docstore/6737de94-7d83-47d3-87b3-fb390c460f10 new file mode 100644 index 0000000000000000000000000000000000000000..a3a5b12622da56afcf66e8d09d0c1c7555dea0d4 --- /dev/null +++ b/docstore/6737de94-7d83-47d3-87b3-fb390c460f10 @@ -0,0 +1 @@ +"gemini-live-2.5-flash-preview" tools = [{ 'google_search' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "When did the last Brazil vs. Argentina soccer match happen?" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) # The model might generate and execute Python code to use Search model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ googleSearch : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/67396caf-a147-4216-8bf5-40ee38ab692d b/docstore/67396caf-a147-4216-8bf5-40ee38ab692d new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/67396caf-a147-4216-8bf5-40ee38ab692d @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
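For the Gemini 2.5 Flash Preview TTS model described above, here is a hedged sketch of a speech request. The response_modalities / speech_config shape and the "Kore" voice name follow the speech generation guide rather than this page, so treat them as assumptions; note that the model returns raw PCM audio, which still needs a container (for example a WAV header) before most players will accept it.

```python
# Hedged sketch of calling the TTS preview model described above. The
# speech_config shape and the "Kore" voice are assumptions taken from the
# speech generation guide; the returned audio is raw PCM bytes.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

pcm_bytes = response.candidates[0].content.parts[0].inline_data.data
print(f"Received {len(pcm_bytes)} bytes of PCM audio")
```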
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/673a8fce-363a-46b6-9cb5-d07eb757c9b5 b/docstore/673a8fce-363a-46b6-9cb5-d07eb757c9b5 new file mode 100644 index 0000000000000000000000000000000000000000..3d32a6c6f44782138d2600dc9a5e7c5bf75a9a24 --- /dev/null +++ b/docstore/673a8fce-363a-46b6-9cb5-d07eb757c9b5 @@ -0,0 +1 @@ +in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . 
NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing \ No newline at end of file diff --git a/docstore/673bcee8-c51d-4253-8a8c-a49dc9817597 b/docstore/673bcee8-c51d-4253-8a8c-a49dc9817597 new file mode 100644 index 0000000000000000000000000000000000000000..e5344d780665917318a495b590674a962d80f8ec --- /dev/null +++ b/docstore/673bcee8-c51d-4253-8a8c-a49dc9817597 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/audio#main-content Title: Audio understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/67605844-5900-45e2-a657-79a11335ad2c b/docstore/67605844-5900-45e2-a657-79a11335ad2c new file mode 100644 index 0000000000000000000000000000000000000000..086087b92745c4e9b643012a145762582d8d8dd5 --- /dev/null +++ b/docstore/67605844-5900-45e2-a657-79a11335ad2c @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "What' \' 's the temperature in London?" } ] } ], "tools": [ { "functionDeclarations": [ { "name": "get_current_temperature", "description": "Gets the current temperature for a given location.", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city name, e.g. San Francisco" } }, "required": ["location"] } } ] } ] }' How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. 
This process can be repeated over multiple turns, allowing for complex interactions and \ No newline at end of file diff --git a/docstore/67623e23-0a2a-49db-8f1c-85084d5d751b b/docstore/67623e23-0a2a-49db-8f1c-85084d5d751b new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/67623e23-0a2a-49db-8f1c-85084d5d751b @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/67626ab6-e6f3-43a4-aa29-3c0897b36803 b/docstore/67626ab6-e6f3-43a4-aa29-3c0897b36803 new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/67626ab6-e6f3-43a4-aa29-3c0897b36803 @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . 
text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/67825f0a-c71e-40f4-8e45-105fc282bf3f b/docstore/67825f0a-c71e-40f4-8e45-105fc282bf3f new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/67825f0a-c71e-40f4-8e45-105fc282bf3f @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . 
ToolConfig ( \ No newline at end of file diff --git a/docstore/67894ea0-d417-4caa-8ebf-02be9d744ea6 b/docstore/67894ea0-d417-4caa-8ebf-02be9d744ea6 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/67894ea0-d417-4caa-8ebf-02be9d744ea6 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/67dceecf-af92-401b-941a-2cb6385ccfbf b/docstore/67dceecf-af92-401b-941a-2cb6385ccfbf new file mode 100644 index 0000000000000000000000000000000000000000..dfeae8fcf584330ed11cdd48e07105d5f4f56b31 --- /dev/null +++ b/docstore/67dceecf-af92-401b-941a-2cb6385ccfbf @@ -0,0 +1 @@ +retrieval_tool ] ) response = client . models . generate_content ( model = 'gemini-1.5-flash' , contents = "Who won the euro 2024?" , config = config , ) print ( response . text ) if not response . candidates [ 0 ] . grounding_metadata : print ( " \n Model answered from its own knowledge." ) JavaScript // Note: This is a legacy approach for Gemini 1.5 models. // The 'googleSearch' tool is recommended for all new development. import { GoogleGenAI , DynamicRetrievalConfigMode } from "@google/genai" ; const ai = new GoogleGenAI ({}); const retrievalTool = { googleSearchRetrieval : { dynamicRetrievalConfig : { mode : DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamicThreshold : 0.7 , // Only search if confidence > 70% }, }, }; const config = { tools : [ retrievalTool ], }; const response = await ai . models . 
generateContent ({ model : "gemini-1.5-flash" , contents : "Who won the euro 2024?" , config , }); console . log ( response . text ); if ( ! response . candidates ? .[ 0 ] ? . groundingMetadata ) { console . log ( "\nModel answered from its own knowledge." ); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ {"parts": [{"text": "Who won the euro 2024?"}]} ], "tools": [{ "google_search_retrieval": { "dynamic_retrieval_config": { "mode": "MODE_DYNAMIC", "dynamic_threshold": 0.7 } } }] }' What's next Try the Grounding with Google Search in the Gemini API Cookbook . Learn about other available tools, like Function Calling . Learn how to augment prompts with specific URLs using the URL context tool . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. \ No newline at end of file diff --git a/docstore/68097572-a7b6-4606-af6e-a872d7011348 b/docstore/68097572-a7b6-4606-af6e-a872d7011348 new file mode 100644 index 0000000000000000000000000000000000000000..097e48b20f2cbfa1b05db2a0f80e7f3c1583707a --- /dev/null +++ b/docstore/68097572-a7b6-4606-af6e-a872d7011348 @@ -0,0 +1 @@ +Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Get a Gemini API Key Get a Gemini API key and make your first API request in minutes. Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil )) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = new Client (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . 
text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H \ No newline at end of file diff --git a/docstore/6823c021-a166-43f0-9029-b18943512b6a b/docstore/6823c021-a166-43f0-9029-b18943512b6a new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/6823c021-a166-43f0-9029-b18943512b6a @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/6827c2e1-0929-44f5-bd6b-a4f4a216d30d b/docstore/6827c2e1-0929-44f5-bd6b-a4f4a216d30d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/6827c2e1-0929-44f5-bd6b-a4f4a216d30d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/684ad1fc-17e6-4c55-98f6-4d8131292493 b/docstore/684ad1fc-17e6-4c55-98f6-4d8131292493 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/684ad1fc-17e6-4c55-98f6-4d8131292493 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/688f10b6-8734-4366-aa87-4666fc92227b b/docstore/688f10b6-8734-4366-aa87-4666fc92227b new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/688f10b6-8734-4366-aa87-4666fc92227b @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/68902522-830d-43c8-ba46-5e7fe77cbd48 b/docstore/68902522-830d-43c8-ba46-5e7fe77cbd48 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/68902522-830d-43c8-ba46-5e7fe77cbd48 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. 
Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/6893ee58-b85e-419e-bc0f-255204dc0645 b/docstore/6893ee58-b85e-419e-bc0f-255204dc0645 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/6893ee58-b85e-419e-bc0f-255204dc0645 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. 
The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/68a6d4d3-7d80-4ca9-872a-2a864387abb6 b/docstore/68a6d4d3-7d80-4ca9-872a-2a864387abb6 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/68a6d4d3-7d80-4ca9-872a-2a864387abb6 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/68bf5ccc-7e1d-4a43-8d34-2f4e85dde2e2 b/docstore/68bf5ccc-7e1d-4a43-8d34-2f4e85dde2e2 new file mode 100644 index 0000000000000000000000000000000000000000..4b5f15989e784aa4b4f5462e86ee08ece0d0f480 --- /dev/null +++ b/docstore/68bf5ccc-7e1d-4a43-8d34-2f4e85dde2e2 @@ -0,0 +1 @@ +pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = client . files . upload ( file = 'a11.txt' ) response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Can you summarize this file:' , my_file ] ) print ( response . text ) List and get List uploaded files and get an uploaded file with a filename: Before Python import google.generativeai as genai for file in genai . list_files (): print ( file . name ) file = genai . get_file ( name = file . name ) After Python from google import genai client = genai . Client () for file in client . files . list (): print ( file . name ) file = client . files . get ( name = file . name ) Delete Delete a file: Before Python import pathlib import google.generativeai as genai pathlib . Path ( 'dummy.txt' ) . 
write_text ( dummy ) dummy_file = genai . upload_file ( path = 'dummy.txt' ) file = genai . delete_file ( name = dummy_file . name ) After Python import pathlib from google import genai client = genai . Client () pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = client . files . upload ( file = 'dummy.txt' ) response = client . files . delete ( name = dummy_file . name ) Context caching Context caching allows the user to pass the content to the model once, cache the input tokens, and then refer to the cached tokens in subsequent calls to lower the cost. Before Python import requests import pathlib import google.generativeai as genai from google.generativeai import caching # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = genai . upload_file ( path = "a11.txt" ) # Create cache apollo_cache = caching . CachedContent . create ( model = "gemini-1.5-flash-001" , system_instruction = "You are an expert at analyzing transcripts." , contents = [ document ], ) # Generate response apollo_model = genai . GenerativeModel . \ No newline at end of file diff --git a/docstore/68d2def6-ddc1-4721-b97c-75b2c76f6860 b/docstore/68d2def6-ddc1-4721-b97c-75b2c76f6860 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/68d2def6-ddc1-4721-b97c-75b2c76f6860 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/68e5ee69-7c72-477a-84b0-eb357b53f8da b/docstore/68e5ee69-7c72-477a-84b0-eb357b53f8da new file mode 100644 index 0000000000000000000000000000000000000000..42de1c1e9ac410bff7cfb9b2a0eff4c24496bea9 --- /dev/null +++ b/docstore/68e5ee69-7c72-477a-84b0-eb357b53f8da @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#imagen-3 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/68e7b6a6-109a-4c90-a2b6-90877f39c4f9 b/docstore/68e7b6a6-109a-4c90-a2b6-90877f39c4f9 new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/68e7b6a6-109a-4c90-a2b6-90877f39c4f9 @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/68f26c3b-06ee-4386-9e90-57145afabf23 b/docstore/68f26c3b-06ee-4386-9e90-57145afabf23 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/68f26c3b-06ee-4386-9e90-57145afabf23 @@ -0,0 +1 @@ +state-of-the-art performance. 
Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/68fb9c8e-b3ae-43a0-9dbf-26553f5da712 b/docstore/68fb9c8e-b3ae-43a0-9dbf-26553f5da712 new file mode 100644 index 0000000000000000000000000000000000000000..52b1c12d2420a9cadaa2ce2a37360b459f77c624 --- /dev/null +++ b/docstore/68fb9c8e-b3ae-43a0-9dbf-26553f5da712 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live#audio-to-audio Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6902fb90-dedd-4f10-a3f8-569587df6805 b/docstore/6902fb90-dedd-4f10-a3f8-569587df6805 new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/6902fb90-dedd-4f10-a3f8-569587df6805 @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . 
getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/69067177-64f2-4020-9591-5864164bec67 b/docstore/69067177-64f2-4020-9591-5864164bec67 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/69067177-64f2-4020-9591-5864164bec67 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/6915ee21-ceb2-4930-b821-7ae3b1feff19 b/docstore/6915ee21-ceb2-4930-b821-7ae3b1feff19 new file mode 100644 index 0000000000000000000000000000000000000000..96cef7defc13924a885e51af123ecd669ceba8d6 --- /dev/null +++ b/docstore/6915ee21-ceb2-4930-b821-7ae3b1feff19 @@ -0,0 +1 @@ +YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , config = GenerateContentConfig ( tools = tools , response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , ], config : { tools : [{ urlContext : {}}, { googleSearch : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute."} ] } ], "tools": [ { "url_context": {} }, { "google_search": {} } ] }' > result.json cat result.json For more details about Grounding with Google Search, see the overview page. Contextual response The model's response will be based on the content it retrieved from the URLs. If the model retrieved content from URLs, the response will include url_context_metadata . Such a response might look something like the following (parts of the response have been omitted for brevity): { "candidates" : [ { "content" : { "parts" : [ { "text" : "... \n" } ], "role" : "model" }, ... "url_context_metadata" : { "url_metadata" : [ { "retrieved_url" : \ No newline at end of file diff --git a/docstore/6937d831-1560-4275-85d6-4f8839184062 b/docstore/6937d831-1560-4275-85d6-4f8839184062 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/6937d831-1560-4275-85d6-4f8839184062 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" 
using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/695fd380-d01c-411e-a262-766d571ddf01 b/docstore/695fd380-d01c-411e-a262-766d571ddf01 new file mode 100644 index 0000000000000000000000000000000000000000..d49c30fb3aba75c6b3242033ae2d96a3ff44eb23 --- /dev/null +++ b/docstore/695fd380-d01c-411e-a262-766d571ddf01 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/billing#main-content Title: Billing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6962581f-04bb-41d2-8773-482681c7b478 b/docstore/6962581f-04bb-41d2-8773-482681c7b478 new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/6962581f-04bb-41d2-8773-482681c7b478 @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . 
generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/6971ae00-ad7b-4dba-a544-af3a4ac43424 b/docstore/6971ae00-ad7b-4dba-a544-af3a4ac43424 new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/6971ae00-ad7b-4dba-a544-af3a4ac43424 @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . \ No newline at end of file diff --git a/docstore/69720dcc-a253-454f-840d-f41e2dfed750 b/docstore/69720dcc-a253-454f-840d-f41e2dfed750 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/69720dcc-a253-454f-840d-f41e2dfed750 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. 
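The Python multi-speaker snippet earlier in this fragment is cut off mid-configuration, so here is a minimal single-speaker sketch assembled from the REST example above. It targets the same gemini-2.5-flash-preview-tts model and voice ("Kore"); the types.SpeechConfig / types.VoiceConfig / types.PrebuiltVoiceConfig names follow the SDK conventions used elsewhere in these docs, so treat this as an illustrative sketch rather than an official sample. Python
from google import genai
from google.genai import types
import wave

client = genai.Client()

# Single-speaker TTS request, mirroring the REST example above.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio comes back as raw PCM bytes in the first part's inline data.
pcm = response.candidates[0].content.parts[0].inline_data.data

# Wrap the PCM in a WAV container (same parameters as the wave_file helper above).
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)       # mono
    wf.setsampwidth(2)       # 16-bit samples
    wf.setframerate(24000)   # 24 kHz
    wf.writeframes(pcm)
The returned part carries raw 24 kHz, 16-bit mono PCM, which is why it is wrapped in a WAV container here; this matches the ffmpeg flags (-f s16le -ar 24000 -ac 1) used in the REST example.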
Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/697a2a39-5e1f-4a3e-ac0f-d275d93f2f4f b/docstore/697a2a39-5e1f-4a3e-ac0f-d275d93f2f4f new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/697a2a39-5e1f-4a3e-ac0f-d275d93f2f4f @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
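As a concrete illustration of that long-context capability, the Files API pattern shown in the SDK migration examples elsewhere in these fragments can hand a large document to gemini-1.5-pro in a single request. A minimal sketch, assuming a local transcript file (the a11.txt name is just the placeholder used in those examples): Python
from google import genai

client = genai.Client()

# Upload a large source document once; the returned handle can be reused
# across requests instead of re-sending the raw text every time.
transcript = client.files.upload(file="a11.txt")  # placeholder path

response = client.models.generate_content(
    model="gemini-1.5-pro",
    contents=["Summarize the key decisions recorded in this transcript.", transcript],
)
print(response.text)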
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/6980ece4-95f9-4a9a-948c-07fd9aa865c0 b/docstore/6980ece4-95f9-4a9a-948c-07fd9aa865c0 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/6980ece4-95f9-4a9a-948c-07fd9aa865c0 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. 
Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/698d49c9-140a-491a-bb14-2f98c3573a7a b/docstore/698d49c9-140a-491a-bb14-2f98c3573a7a new file mode 100644 index 0000000000000000000000000000000000000000..40564cc3a339b41e3f9c5a2f24a7d0082d31abf9 --- /dev/null +++ b/docstore/698d49c9-140a-491a-bb14-2f98c3573a7a @@ -0,0 +1 @@ +response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # \ No newline at end of file diff --git a/docstore/6993fe76-2b10-491d-81d8-879792f1b842 b/docstore/6993fe76-2b10-491d-81d8-879792f1b842 new file mode 100644 index 0000000000000000000000000000000000000000..f588a79a4036932ab17be217863724b667d93cf6 --- /dev/null +++ b/docstore/6993fe76-2b10-491d-81d8-879792f1b842 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#tasks Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/69a0d639-da5a-499a-b684-524ac150e17a b/docstore/69a0d639-da5a-499a-b684-524ac150e17a new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/69a0d639-da5a-499a-b684-524ac150e17a @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . 
GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/69a12845-3415-40a6-a52a-ec84321b56fb b/docstore/69a12845-3415-40a6-a52a-ec84321b56fb new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/69a12845-3415-40a6-a52a-ec84321b56fb @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
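Because the model card below lists Thinking as supported, a gemini-2.5-flash request can ask for thought summaries in the same way the gemini-2.5-pro examples elsewhere in these fragments do. A minimal sketch under that assumption: Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="A bat and a ball cost $1.10 together, and the bat costs $1.00 more than the ball. How much is the ball?",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(include_thoughts=True)
    ),
)

# Thought-summary parts are flagged with part.thought, as in the 2.5 Pro examples.
for part in response.candidates[0].content.parts:
    if not part.text:
        continue
    label = "Thought summary:" if part.thought else "Answer:"
    print(label, part.text)
Thought-summary parts arrive alongside the answer parts and are distinguished only by the part.thought flag, so iterating over all parts is the simplest way to separate them.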
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/69aec093-75b6-4be0-a121-83b51101b2ec b/docstore/69aec093-75b6-4be0-a121-83b51101b2ec new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/69aec093-75b6-4be0-a121-83b51101b2ec @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/69c41ff8-f7c3-41a7-bbc4-751f2b2ab06b b/docstore/69c41ff8-f7c3-41a7-bbc4-751f2b2ab06b new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/69c41ff8-f7c3-41a7-bbc4-751f2b2ab06b @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. 
**First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/69f44a5a-5dd2-4017-9f5a-6d1a68eee511 b/docstore/69f44a5a-5dd2-4017-9f5a-6d1a68eee511 new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/69f44a5a-5dd2-4017-9f5a-6d1a68eee511 @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. 
- Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/6a1662d9-cf5a-4172-bd43-60d13c35fdda b/docstore/6a1662d9-cf5a-4172-bd43-60d13c35fdda new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/6a1662d9-cf5a-4172-bd43-60d13c35fdda @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/6a2dcd08-c60d-45ed-bfe1-5a055fa0e0e3 b/docstore/6a2dcd08-c60d-45ed-bfe1-5a055fa0e0e3 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/6a2dcd08-c60d-45ed-bfe1-5a055fa0e0e3 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/6a63dc5b-a145-4294-bcfe-63f5350ff53f b/docstore/6a63dc5b-a145-4294-bcfe-63f5350ff53f new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/6a63dc5b-a145-4294-bcfe-63f5350ff53f @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, or break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only.
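The best practices above (a small set of relevant tools, explicit instructions, temperature 0, and user validation before side effects) can be sketched in Python roughly as follows. This is not the guide's own sample: the get_weather declaration and its parameters are hypothetical, and only the general google-genai function-calling pattern shown earlier in these docs is assumed.

from google import genai
from google.genai import types

client = genai.Client()

# One narrowly scoped tool, declared as a plain dict (hypothetical example).
get_weather = {
    "name": "get_weather",
    "description": "Gets the current weather for a given city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string", "description": "City name"}},
        "required": ["city"],
    },
}

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the weather in Lake Tahoe?",
    config=types.GenerateContentConfig(
        system_instruction="You are a helpful weather assistant. Ask for the city if it is missing.",
        temperature=0,  # lower temperature makes function calls more deterministic
        tools=[types.Tool(function_declarations=[get_weather])],
    ),
)

# Inspect the proposed call before executing anything with side effects.
part = response.candidates[0].content.parts[0]
if part.function_call:
    print("Model proposed:", part.function_call.name, part.function_call.args)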
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/6a7c65a3-e3fa-4ec1-9f29-8ca1820d00f9 b/docstore/6a7c65a3-e3fa-4ec1-9f29-8ca1820d00f9 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/6a7c65a3-e3fa-4ec1-9f29-8ca1820d00f9 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video,
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/6aa6f586-c204-4d99-a18f-bbf1aeff20b7 b/docstore/6aa6f586-c204-4d99-a18f-bbf1aeff20b7 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/6aa6f586-c204-4d99-a18f-bbf1aeff20b7 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. 
This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/6aab860b-0bd8-41a6-9256-62559be358b0 b/docstore/6aab860b-0bd8-41a6-9256-62559be358b0 new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/6aab860b-0bd8-41a6-9256-62559be358b0 @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . 
Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/6ab5355b-d269-4017-8ce0-8ee3d2dc2c11 b/docstore/6ab5355b-d269-4017-8ce0-8ee3d2dc2c11 new file mode 100644 index 0000000000000000000000000000000000000000..ed2abe436f910cdcd535902f28a2dda403d40cb9 --- /dev/null +++ b/docstore/6ab5355b-d269-4017-8ce0-8ee3d2dc2c11 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#main-content Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6ab5ab34-92ee-43e2-b4c3-a13ccd546cfe b/docstore/6ab5ab34-92ee-43e2-b4c3-a13ccd546cfe new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/6ab5ab34-92ee-43e2-b4c3-a13ccd546cfe @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/6acd2ef8-eaf4-4e72-b6d8-4a79d51ba3a5 b/docstore/6acd2ef8-eaf4-4e72-b6d8-4a79d51ba3a5 new file mode 100644 index 0000000000000000000000000000000000000000..82539837fcb7adc353717f66580809eb160e30f0 --- /dev/null +++ b/docstore/6acd2ef8-eaf4-4e72-b6d8-4a79d51ba3a5 @@ -0,0 +1 @@ +Google stock price?" , tools = 'google_search_retrieval' ) After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the Google stock price?' , config = types . GenerateContentConfig ( tools = [ types . Tool ( google_search = types . 
GoogleSearch () ) ] ) ) JSON response Generate answers in JSON format. Before Python By specifying a response_schema and setting response_mime_type="application/json" users can constrain the model to produce a JSON response following a given structure. import google.generativeai as genai import typing_extensions as typing class CountryInfo ( typing . TypedDict ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" ) result = model . generate_content ( "Give me information of the United States" , generation_config = genai . GenerationConfig ( response_mime_type = "application/json" , response_schema = CountryInfo ), ) JavaScript import { GoogleGenerativeAI , SchemaType } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const schema = { description : "List of recipes" , type : SchemaType . ARRAY , items : { type : SchemaType . OBJECT , properties : { recipeName : { type : SchemaType . STRING , description : "Name of the recipe" , nullable : false , }, }, required : [ "recipeName" ], }, }; const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" , generationConfig : { responseMimeType : "application/json" , responseSchema : schema , }, }); const result = await model . generateContent ( "List a few popular cookie recipes." , ); console . log ( result . response . text ()); After Python The new SDK uses pydantic classes to provide the schema (although you can pass a genai.types.Schema , or equivalent \ No newline at end of file diff --git a/docstore/6aef9261-a4ca-4033-a47d-12d4e173151c b/docstore/6aef9261-a4ca-4033-a47d-12d4e173151c new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/6aef9261-a4ca-4033-a47d-12d4e173151c @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/6b1c9fe3-c8b3-44ff-b401-0683bf0473e4 b/docstore/6b1c9fe3-c8b3-44ff-b401-0683bf0473e4 new file mode 100644 index 0000000000000000000000000000000000000000..4c17b2856d11bdc3b188a5b9a3f56ab6d4b47404 --- /dev/null +++ b/docstore/6b1c9fe3-c8b3-44ff-b401-0683bf0473e4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#imagen-4 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6b3735b5-3307-47d5-b8a1-fa8fa02da612 b/docstore/6b3735b5-3307-47d5-b8a1-fa8fa02da612 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/6b3735b5-3307-47d5-b8a1-fa8fa02da612 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/6b496e2c-4de8-43dd-be1a-006284fdbd93 b/docstore/6b496e2c-4de8-43dd-be1a-006284fdbd93 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/6b496e2c-4de8-43dd-be1a-006284fdbd93 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . 
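The embedding model cards above list token limits and output dimensions but no call site. As a minimal sketch, assuming the google-genai Python SDK's embed_content method and the text-embedding-004 model described above (768-dimensional output), with placeholder input strings:

from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How much wood would a woodchuck chuck?"],
)

# One embedding per input string; each has 768 values for this model.
for embedding in result.embeddings:
    print(len(embedding.values))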
Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/6b5d71dd-8d5e-4f31-8984-adcc05198610 b/docstore/6b5d71dd-8d5e-4f31-8984-adcc05198610 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/6b5d71dd-8d5e-4f31-8984-adcc05198610 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/6b6f32e5-500b-473f-a306-10558750d451 b/docstore/6b6f32e5-500b-473f-a306-10558750d451 new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/6b6f32e5-500b-473f-a306-10558750d451 @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. "generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. 
Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/6b7400e4-4846-409f-a6dd-d3ee85ddd274 b/docstore/6b7400e4-4846-409f-a6dd-d3ee85ddd274 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/6b7400e4-4846-409f-a6dd-d3ee85ddd274 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/6b7b92ba-6ec8-47ee-a8df-5a7bfa1de01f b/docstore/6b7b92ba-6ec8-47ee-a8df-5a7bfa1de01f new file mode 100644 index 0000000000000000000000000000000000000000..c60a398b68d2fb158c62411b9f70b1da071d4fb4 --- /dev/null +++ b/docstore/6b7b92ba-6ec8-47ee-a8df-5a7bfa1de01f @@ -0,0 +1 @@ +response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . 
Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. \ No newline at end of file diff --git a/docstore/6b9ddf30-1bac-4db3-94a6-49906479173e b/docstore/6b9ddf30-1bac-4db3-94a6-49906479173e new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/6b9ddf30-1bac-4db3-94a6-49906479173e @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/6bd36d94-cbcf-4469-bdb0-a2d7b6ebc337 b/docstore/6bd36d94-cbcf-4469-bdb0-a2d7b6ebc337 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/6bd36d94-cbcf-4469-bdb0-a2d7b6ebc337 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/6bf0e44f-15e9-4860-8f6c-bef8ec10e66a b/docstore/6bf0e44f-15e9-4860-8f6c-bef8ec10e66a new file mode 100644 index 0000000000000000000000000000000000000000..67d10e7862bf74e2ae567ff38c6cc00fda524c2e --- /dev/null +++ b/docstore/6bf0e44f-15e9-4860-8f6c-bef8ec10e66a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling/tutorial#step-4 Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6c0a2ebf-7e4c-4332-9dce-e89c76162713 b/docstore/6c0a2ebf-7e4c-4332-9dce-e89c76162713 new file mode 100644 index 0000000000000000000000000000000000000000..10c56dda4e771cbe191acbd7eaea4d6ff44484f5 --- /dev/null +++ b/docstore/6c0a2ebf-7e4c-4332-9dce-e89c76162713 @@ -0,0 +1 @@ +which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. 
message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", \ No newline at end of file diff --git a/docstore/6c102731-6b13-4193-ad43-2a8f60f7beb7 b/docstore/6c102731-6b13-4193-ad43-2a8f60f7beb7 new file mode 100644 index 0000000000000000000000000000000000000000..25e7f0a11814fd13ef8a765ba25aac7ae3b4b3ce --- /dev/null +++ b/docstore/6c102731-6b13-4193-ad43-2a8f60f7beb7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/long-context Title: Long context | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6c19ca62-90a4-4a1b-82aa-d7f3a9f401a6 b/docstore/6c19ca62-90a4-4a1b-82aa-d7f3a9f401a6 new file mode 100644 index 0000000000000000000000000000000000000000..03e316e434a13c2a6804dc8cff96f196f07c7e52 --- /dev/null +++ b/docstore/6c19ca62-90a4-4a1b-82aa-d7f3a9f401a6 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers. Here's how I'll approach this: 1. **Generate Prime Numbers:** I'll use an iterative method to find prime numbers. I'll start with 2 and check if each subsequent number is divisible by any number between 2 and its square root. If not, it's a prime. 2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of them. 3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list. Here's the Python code to do this: def is_prime(n): """Efficiently checks if a number is prime.""" if n <= 1: return False if n <= 3: return True if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: if n % i == 0 or n % (i + 2) == 0: return False i += 6 return True primes = [] num = 2 while len(primes) < 50: if is_prime(num): primes.append(num) num += 1 sum_of_primes = sum(primes) print(f'{primes=}') print(f'{sum_of_primes=}') primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229] sum_of_primes=5117 The sum of the first 50 prime numbers is 5117. This output combines several content parts that the model returns when using code execution: text : Inline text generated by the model executableCode : Code generated by the model that is meant to be executed codeExecutionResult : Result of the executable code The naming conventions for these parts vary by programming language. Use code execution in chat You can also use code execution as part of a chat. Python from google import genai from google.genai import types client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) response = chat . send_message ( "I have a math question for you." ) print ( response . text ) response = chat . 
send_message ( "What is \ No newline at end of file diff --git a/docstore/6c20d223-1398-4f0f-82cd-c1c843343e75 b/docstore/6c20d223-1398-4f0f-82cd-c1c843343e75 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/6c20d223-1398-4f0f-82cd-c1c843343e75 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/6c33ff0e-55a8-4323-9c01-13d9b0e252c6 b/docstore/6c33ff0e-55a8-4323-9c01-13d9b0e252c6 new file mode 100644 index 0000000000000000000000000000000000000000..4d5e24b23445eada240041ce046d4864e5df3992 --- /dev/null +++ b/docstore/6c33ff0e-55a8-4323-9c01-13d9b0e252c6 @@ -0,0 +1 @@ +anything with a lower probability is allowed. Threshold (Google AI Studio) Threshold (API) Description Block none BLOCK_NONE Always show regardless of probability of unsafe content Block few BLOCK_ONLY_HIGH Block when high probability of unsafe content Block some BLOCK_MEDIUM_AND_ABOVE Block when medium or high probability of unsafe content Block most BLOCK_LOW_AND_ABOVE Block when low, medium or high probability of unsafe content N/A HARM_BLOCK_THRESHOLD_UNSPECIFIED Threshold is unspecified, block using default threshold If the threshold is not set, the default block threshold is Block none (for gemini-1.5-pro-002 and gemini-1.5-flash-002 and all newer stable GA models) or Block some (in all other models) for all categories except the Civic integrity category. The default block threshold for the Civic integrity category is Block none (for gemini-2.0-flash-001 aliased as gemini-2.0-flash , gemini-2.0-pro-exp-02-05 , and gemini-2.0-flash-lite ) both for Google AI Studio and the Gemini API, and Block most for all other models in Google AI Studio only. You can set these settings for each request you make to the generative service. See the HarmBlockThreshold API reference for details. Safety feedback generateContent returns a GenerateContentResponse which includes safety feedback. Prompt feedback is included in promptFeedback . If promptFeedback.blockReason is set, then the content of the prompt was blocked. Response candidate feedback is included in Candidate.finishReason and Candidate.safetyRatings . If response content was blocked and the finishReason was SAFETY , you can inspect safetyRatings for more details. The content that was blocked is not returned. Adjust safety settings This section covers how to adjust the safety settings in both Google AI Studio and in your code. Google AI Studio You can adjust safety settings in Google AI Studio, but you cannot turn them off. Click Edit safety settings in the Run settings panel to open the Run safety settings modal. 
In \ No newline at end of file diff --git a/docstore/6c4676ea-1710-4e3c-8b43-1368b4328c0c b/docstore/6c4676ea-1710-4e3c-8b43-1368b4328c0c new file mode 100644 index 0000000000000000000000000000000000000000..ffa55cd17dc266b0e00c821779e2850dd473d215 --- /dev/null +++ b/docstore/6c4676ea-1710-4e3c-8b43-1368b4328c0c @@ -0,0 +1 @@ +"Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/6c56fc20-f834-4e0f-9a88-126ff0e58c03 b/docstore/6c56fc20-f834-4e0f-9a88-126ff0e58c03 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/6c56fc20-f834-4e0f-9a88-126ff0e58c03 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. 
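To generate embeddings in code, a minimal Python sketch using the google-genai SDK might look like the following. It assumes the text-embedding-004 model code listed above; the two input strings and the cosine helper are only for illustration.

from google import genai

client = genai.Client()

# Embed two strings in one call.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=[
        "What is the meaning of life?",
        "How much wood would a woodchuck chuck?",
    ],
)

def cosine(a, b):
    # Cosine similarity between two equal-length vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    return dot / (norm_a * norm_b)

v1, v2 = (e.values for e in result.embeddings)
print(len(v1))           # 768 dimensions for this model
print(cosine(v1, v2))    # higher values mean more closely related strings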
The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/6c5b334e-9392-4eb5-a1fc-5e5bab76eda3 b/docstore/6c5b334e-9392-4eb5-a1fc-5e5bab76eda3 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/6c5b334e-9392-4eb5-a1fc-5e5bab76eda3 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/6ca1dc03-3cb3-43b4-8f8d-8f59acddd571 b/docstore/6ca1dc03-3cb3-43b4-8f8d-8f59acddd571 new file mode 100644 index 0000000000000000000000000000000000000000..fb9d4ecb9861e693d44b1b3683d42182f7d40f2b --- /dev/null +++ b/docstore/6ca1dc03-3cb3-43b4-8f8d-8f59acddd571 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/structured-output#schemas-in-python Title: Structured output | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6cadf661-e8cb-4e64-94e0-ecb52461411e b/docstore/6cadf661-e8cb-4e64-94e0-ecb52461411e new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/6cadf661-e8cb-4e64-94e0-ecb52461411e @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/6cc9e804-4e90-44c1-8bc6-41f5d99facce b/docstore/6cc9e804-4e90-44c1-8bc6-41f5d99facce new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/6cc9e804-4e90-44c1-8bc6-41f5d99facce @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." 
, }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/6cd1b88a-c06b-4235-b8a0-ac9951337457 b/docstore/6cd1b88a-c06b-4235-b8a0-ac9951337457 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/6cd1b88a-c06b-4235-b8a0-ac9951337457 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
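Video generation with this model is a long-running operation rather than a single synchronous call. The following is a rough, non-authoritative sketch: it assumes the google-genai Python SDK exposes video generation as generate_videos with operation polling, and the prompt and output file names are placeholders.

import time
from google import genai

client = genai.Client()

# Start video generation; this returns a long-running operation, not a finished video.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A timelapse of a desert sky turning from dusk to a field of stars.",
)

# Poll until the operation completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Download and save each generated video.
for i, generated in enumerate(operation.response.generated_videos):
    client.files.download(file=generated.video)
    generated.video.save(f"veo_sample_{i}.mp4")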
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/6cd48410-2f73-4a3a-acbd-5cc309c0ce65 b/docstore/6cd48410-2f73-4a3a-acbd-5cc309c0ce65 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/6cd48410-2f73-4a3a-acbd-5cc309c0ce65 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/6cd69745-415c-404e-bf6b-07529a45e171 b/docstore/6cd69745-415c-404e-bf6b-07529a45e171 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/6cd69745-415c-404e-bf6b-07529a45e171 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . 
Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/6ce29346-c4c1-4c69-ab1b-96a87754a529 b/docstore/6ce29346-c4c1-4c69-ab1b-96a87754a529 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/6ce29346-c4c1-4c69-ab1b-96a87754a529 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... 
Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/6d0a48ef-6190-4ed3-ac28-3b37d4f1966b b/docstore/6d0a48ef-6190-4ed3-ac28-3b37d4f1966b new file mode 100644 index 0000000000000000000000000000000000000000..f65bfb5d195a3160683160d98bf38afd321eba5f --- /dev/null +++ b/docstore/6d0a48ef-6190-4ed3-ac28-3b37d4f1966b @@ -0,0 +1 @@ +Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/6d1a73f6-2431-4c6f-b6df-b9adc8ac0d84 b/docstore/6d1a73f6-2431-4c6f-b6df-b9adc8ac0d84 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/6d1a73f6-2431-4c6f-b6df-b9adc8ac0d84 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/6d37b25c-7e54-4a7b-9bf6-822ddb664a20 b/docstore/6d37b25c-7e54-4a7b-9bf6-822ddb664a20 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/6d37b25c-7e54-4a7b-9bf6-822ddb664a20 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/6d3c2a39-2301-4f02-9219-f220cb39576b b/docstore/6d3c2a39-2301-4f02-9219-f220cb39576b new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/6d3c2a39-2301-4f02-9219-f220cb39576b @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/6d9163a4-4b89-4fbd-ab26-af06e3c7d830 b/docstore/6d9163a4-4b89-4fbd-ab26-af06e3c7d830 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/6d9163a4-4b89-4fbd-ab26-af06e3c7d830 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. 
Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/6d9a2386-3b2f-4a90-b5a8-916a5d21e19b b/docstore/6d9a2386-3b2f-4a90-b5a8-916a5d21e19b new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/6d9a2386-3b2f-4a90-b5a8-916a5d21e19b @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/6dbc48c9-f803-4aa7-8ab1-e02daa7bb710 b/docstore/6dbc48c9-f803-4aa7-8ab1-e02daa7bb710 new file mode 100644 index 0000000000000000000000000000000000000000..d91888efdf44ca016580b2841f794e9350a48393 --- /dev/null +++ b/docstore/6dbc48c9-f803-4aa7-8ab1-e02daa7bb710 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#automatic_function_calling_python_only Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6dce3a2b-4daa-458e-8830-c7b81421958c b/docstore/6dce3a2b-4daa-458e-8830-c7b81421958c new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/6dce3a2b-4daa-458e-8830-c7b81421958c @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/6dd320ca-6158-4164-8d00-7adc04db3ea1 b/docstore/6dd320ca-6158-4164-8d00-7adc04db3ea1 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/6dd320ca-6158-4164-8d00-7adc04db3ea1 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
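As a small illustration of that long-context use, the sketch below combines the Files API upload pattern shown elsewhere on these pages with the gemini-2.5-pro model code; the file path and prompt are placeholders, so treat this as an approximation rather than an official sample.

from google import genai

client = genai.Client()

# Upload a large document once, then reference it in a long-context request.
# "path/to/report.pdf" is a placeholder path.
doc = client.files.upload(file="path/to/report.pdf")

response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=["Summarize the key findings and list any open questions.", doc],
)
print(response.text)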
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/6e0415ab-5b8f-4ea9-9cfa-19058252e025 b/docstore/6e0415ab-5b8f-4ea9-9cfa-19058252e025 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/6e0415ab-5b8f-4ea9-9cfa-19058252e025 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/6e22f754-bdce-48c4-801f-547f5970c8f1 b/docstore/6e22f754-bdce-48c4-801f-547f5970c8f1 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/6e22f754-bdce-48c4-801f-547f5970c8f1 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/6e3daf25-9b1a-48c7-8da3-26ea2d9ce275 b/docstore/6e3daf25-9b1a-48c7-8da3-26ea2d9ce275 new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/6e3daf25-9b1a-48c7-8da3-26ea2d9ce275 @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. 
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/6e54a9fa-c84a-4bfa-b5eb-c8403ca8cdbc b/docstore/6e54a9fa-c84a-4bfa-b5eb-c8403ca8cdbc new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/6e54a9fa-c84a-4bfa-b5eb-c8403ca8cdbc @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/6e5b924e-eedc-40ff-8a2a-9f0323e4b2c0 b/docstore/6e5b924e-eedc-40ff-8a2a-9f0323e4b2c0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/6e5b924e-eedc-40ff-8a2a-9f0323e4b2c0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/6e64867b-a9a3-4a41-b437-7cdd0fd1ba59 b/docstore/6e64867b-a9a3-4a41-b437-7cdd0fd1ba59 new file mode 100644 index 0000000000000000000000000000000000000000..fde5008e10da059aa2ac847e9fab5e369116574b --- /dev/null +++ b/docstore/6e64867b-a9a3-4a41-b437-7cdd0fd1ba59 @@ -0,0 +1 @@ +You can set fields as required to force the model to provide a value. If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on. A complex schema can result in an InvalidArgument: 400 error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors. If you get this error with a valid schema, make one or more of the following changes to resolve the error: Shorten property names or enum names. Flatten nested arrays. 
Reduce the number of properties with constraints, such as numbers with minimum and maximum limits. Reduce the number of properties with complex constraints, such as properties with complex formats like date-time . Reduce the number of optional properties. Reduce the number of valid values for enums. If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. What's next Now that you've learned how to generate structured output, you might want to try using Gemini API tools: Function calling Code execution Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/6e66f65e-a5ba-4ba9-9cd2-69cca51c7609 b/docstore/6e66f65e-a5ba-4ba9-9cd2-69cca51c7609 new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/6e66f65e-a5ba-4ba9-9cd2-69cca51c7609 @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. 
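A minimal Python sketch of that text-to-audio flow is shown below. It assumes the google-genai Live API and the gemini-live-2.5-flash-preview model named elsewhere on these pages, and writes the returned audio as 24 kHz, 16-bit mono PCM; treat the details as an approximation rather than this page's own sample.

import asyncio
import wave
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"   # Live-capable model named in these docs
config = {"response_modalities": ["AUDIO"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Collect the model's audio reply into a WAV container.
        with wave.open("audio.wav", "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)        # 16-bit samples
            wf.setframerate(24000)    # Live API audio output rate

            await session.send_client_content(
                turns={"role": "user", "parts": [{"text": "Hello, can you hear me?"}]},
                turn_complete=True,
            )

            async for message in session.receive():
                if message.data is not None:
                    wf.writeframes(message.data)

asyncio.run(main())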
This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/6e7cc833-6526-4db9-bef9-ff7526624109 b/docstore/6e7cc833-6526-4db9-bef9-ff7526624109 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/6e7cc833-6526-4db9-bef9-ff7526624109 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/6eaac96a-1aec-446a-85ee-1f069391f712 b/docstore/6eaac96a-1aec-446a-85ee-1f069391f712 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/6eaac96a-1aec-446a-85ee-1f069391f712 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . 
generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/6eba97a5-fa0e-413f-ae71-0351151c2427 b/docstore/6eba97a5-fa0e-413f-ae71-0351151c2427 new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/6eba97a5-fa0e-413f-ae71-0351151c2427 @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . 
\ No newline at end of file diff --git a/docstore/6ed2195c-6b1b-4b61-8766-7ff062d23ff7 b/docstore/6ed2195c-6b1b-4b61-8766-7ff062d23ff7 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/6ed2195c-6b1b-4b61-8766-7ff062d23ff7 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/6ede756d-5794-4b71-a0d4-8318d2f1b4be b/docstore/6ede756d-5794-4b71-a0d4-8318d2f1b4be new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/6ede756d-5794-4b71-a0d4-8318d2f1b4be @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/6ef7f0ad-804b-46f0-bf2b-c46a42f6ae8b b/docstore/6ef7f0ad-804b-46f0-bf2b-c46a42f6ae8b new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/6ef7f0ad-804b-46f0-bf2b-c46a42f6ae8b @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/6f3bd3a0-b7d5-4496-998a-2f3fbdba0b5b b/docstore/6f3bd3a0-b7d5-4496-998a-2f3fbdba0b5b new file mode 100644 index 0000000000000000000000000000000000000000..980cad742ce4bdad224c6b76fd35613451194dd7 --- /dev/null +++ b/docstore/6f3bd3a0-b7d5-4496-998a-2f3fbdba0b5b @@ -0,0 +1 @@ +parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : \ No newline at end of file diff --git a/docstore/6f5d95dc-eafd-4b85-b4f3-b2be23c7c72e b/docstore/6f5d95dc-eafd-4b85-b4f3-b2be23c7c72e new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/6f5d95dc-eafd-4b85-b4f3-b2be23c7c72e @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. 
When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/6f7c277c-5d29-419e-af6c-96fc02ad0a53 b/docstore/6f7c277c-5d29-419e-af6c-96fc02ad0a53 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/6f7c277c-5d29-419e-af6c-96fc02ad0a53 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" 
Medium Tasks \ No newline at end of file diff --git a/docstore/6f88bee4-54c4-47c2-a782-85210f115ba6 b/docstore/6f88bee4-54c4-47c2-a782-85210f115ba6 new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/6f88bee4-54c4-47c2-a782-85210f115ba6 @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/6f8a2867-68bb-46d9-8f0d-8a9c275518c4 b/docstore/6f8a2867-68bb-46d9-8f0d-8a9c275518c4 new file mode 100644 index 0000000000000000000000000000000000000000..492e06a8c4d039b04fc73ff321e1f8af54508e49 --- /dev/null +++ b/docstore/6f8a2867-68bb-46d9-8f0d-8a9c275518c4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/available-regions#main-content Title: Available regions for Google AI Studio and Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6fa3116e-c676-44e0-9dbf-48da50fab9cf b/docstore/6fa3116e-c676-44e0-9dbf-48da50fab9cf new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/6fa3116e-c676-44e0-9dbf-48da50fab9cf @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . 
stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/6fad3bd1-8391-4966-aaae-a568a144fa60 b/docstore/6fad3bd1-8391-4966-aaae-a568a144fa60 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/6fad3bd1-8391-4966-aaae-a568a144fa60 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/6fb15240-01bb-4bed-8d60-21d5c483d55e b/docstore/6fb15240-01bb-4bed-8d60-21d5c483d55e new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/6fb15240-01bb-4bed-8d60-21d5c483d55e @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/6fc4cd0f-dc97-4561-b724-3a700688e1d7 b/docstore/6fc4cd0f-dc97-4561-b724-3a700688e1d7 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/6fc4cd0f-dc97-4561-b724-3a700688e1d7 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." 
, } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/6fd6bbcc-93de-4e89-b1f7-fe4b0494fa5c b/docstore/6fd6bbcc-93de-4e89-b1f7-fe4b0494fa5c new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/6fd6bbcc-93de-4e89-b1f7-fe4b0494fa5c @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. 
Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/6fec8763-9dd6-4237-a8c6-420c4d72dce4 b/docstore/6fec8763-9dd6-4237-a8c6-420c4d72dce4 new file mode 100644 index 0000000000000000000000000000000000000000..d848b8568b3ba7e96106d5699014f6db42af09d2 --- /dev/null +++ b/docstore/6fec8763-9dd6-4237-a8c6-420c4d72dce4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#capabilities Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/6feef52d-d3cc-4166-9ff1-7dc7da6af9b1 b/docstore/6feef52d-d3cc-4166-9ff1-7dc7da6af9b1 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/6feef52d-d3cc-4166-9ff1-7dc7da6af9b1 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/6ffcf7db-b13e-418b-b2c4-f0dbebf5490f b/docstore/6ffcf7db-b13e-418b-b2c4-f0dbebf5490f new file mode 100644 index 0000000000000000000000000000000000000000..fde5008e10da059aa2ac847e9fab5e369116574b --- /dev/null +++ b/docstore/6ffcf7db-b13e-418b-b2c4-f0dbebf5490f @@ -0,0 +1 @@ +You can set fields as required to force the model to provide a value. If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on. A complex schema can result in an InvalidArgument: 400 error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors. If you get this error with a valid schema, make one or more of the following changes to resolve the error: Shorten property names or enum names. Flatten nested arrays. Reduce the number of properties with constraints, such as numbers with minimum and maximum limits. Reduce the number of properties with complex constraints, such as properties with complex formats like date-time . Reduce the number of optional properties. Reduce the number of valid values for enums. If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. What's next Now that you've learned how to generate structured output, you might want to try using Gemini API tools: Function calling Code execution Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/6ffdb1e6-ec91-467d-873d-c571d0e814a9 b/docstore/6ffdb1e6-ec91-467d-873d-c571d0e814a9 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/6ffdb1e6-ec91-467d-873d-c571d0e814a9 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/70148955-20a4-4fad-ac77-1c6a77ce25ac b/docstore/70148955-20a4-4fad-ac77-1c6a77ce25ac new file mode 100644 index 0000000000000000000000000000000000000000..d56b3daa38a2092b0a61eaf4baf6d81916c2e2ff --- /dev/null +++ b/docstore/70148955-20a4-4fad-ac77-1c6a77ce25ac @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies#few-shot Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/704a8f40-183a-447c-9bd3-6838b5a27772 b/docstore/704a8f40-183a-447c-9bd3-6838b5a27772 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/704a8f40-183a-447c-9bd3-6838b5a27772 @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. 
Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/705ecfb0-57ee-43df-a312-4c5b87b98f5c b/docstore/705ecfb0-57ee-43df-a312-4c5b87b98f5c new file mode 100644 index 0000000000000000000000000000000000000000..05ee4a71511b35152369f2b924a4a164d562cc05 --- /dev/null +++ b/docstore/705ecfb0-57ee-43df-a312-4c5b87b98f5c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/billing#enable-cloud-billing Title: Billing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/706575b5-148d-4b0b-8ef1-1527a8fabdad b/docstore/706575b5-148d-4b0b-8ef1-1527a8fabdad new file mode 100644 index 0000000000000000000000000000000000000000..48185d96da78033431814f0cadbe551d257fc173 --- /dev/null +++ b/docstore/706575b5-148d-4b0b-8ef1-1527a8fabdad @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation#system-instructions Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/70676b7f-276d-453b-868c-7f568bfbb304 b/docstore/70676b7f-276d-453b-868c-7f568bfbb304 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/70676b7f-276d-453b-868c-7f568bfbb304 @@ -0,0 +1 @@ +patterns for more details. 
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/70897c24-0c9c-4602-9eda-46a5a44012f6 b/docstore/70897c24-0c9c-4602-9eda-46a5a44012f6 new file mode 100644 index 0000000000000000000000000000000000000000..fdea6397d0ee0c5ce13453eceb7f458532b87688 --- /dev/null +++ b/docstore/70897c24-0c9c-4602-9eda-46a5a44012f6 @@ -0,0 +1 @@ +"BLOCK_MEDIUM_AND_ABOVE"} ], "contents": [{ "parts":[{ "text": "' I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write a ironic phrase about them. '"}]}]}' > request.json curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d @request.json 2 > /dev/null Next steps See the API reference to learn more about the full API. Review the safety guidance for a general look at safety considerations when developing with LLMs. Learn more about assessing probability versus severity from the Jigsaw team Learn more about the products that contribute to safety solutions like the Perspective API . * You can use these safety settings to create a toxicity classifier. See the classification example to get started. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
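The REST request above applies a per-request safety threshold; a rough Python equivalent is sketched below, assuming the google-genai types expose SafetySetting, HarmCategory, and HarmBlockThreshold (the harassment category is chosen only as an example):

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write an ironic phrase about them.",
    config=types.GenerateContentConfig(
        safety_settings=[
            # Assumed enum names; pick the category/threshold that fits your use case.
            types.SafetySetting(
                category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
                threshold=types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
            ),
        ],
    ),
)
print(response.text)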
\ No newline at end of file diff --git a/docstore/708a3ca1-405f-4e24-86ae-1709da755d5d b/docstore/708a3ca1-405f-4e24-86ae-1709da755d5d new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/708a3ca1-405f-4e24-86ae-1709da755d5d @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/708e8e7f-489b-49d2-aed9-9e642c9a7ec0 b/docstore/708e8e7f-489b-49d2-aed9-9e642c9a7ec0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/708e8e7f-489b-49d2-aed9-9e642c9a7ec0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/70a8f4fa-5ba7-41a3-b25b-a9e89aed9c73 b/docstore/70a8f4fa-5ba7-41a3-b25b-a9e89aed9c73 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/70a8f4fa-5ba7-41a3-b25b-a9e89aed9c73 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/71236dc2-e233-44fd-b985-5454473a3cbd b/docstore/71236dc2-e233-44fd-b985-5454473a3cbd new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/71236dc2-e233-44fd-b985-5454473a3cbd @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/7126491a-a7a3-47d7-b6a9-92d3be6bb2a3 b/docstore/7126491a-a7a3-47d7-b6a9-92d3be6bb2a3 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/7126491a-a7a3-47d7-b6a9-92d3be6bb2a3 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . 
Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/712d5a3f-1427-4204-8dcf-231d9798af8a b/docstore/712d5a3f-1427-4204-8dcf-231d9798af8a new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/712d5a3f-1427-4204-8dcf-231d9798af8a @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/7141c392-966f-4c37-88c6-24290a5fdd8f b/docstore/7141c392-966f-4c37-88c6-24290a5fdd8f new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/7141c392-966f-4c37-88c6-24290a5fdd8f @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. 
Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/7153096d-eaa8-4019-867d-991fd3b7a4a6 b/docstore/7153096d-eaa8-4019-867d-991fd3b7a4a6 new file mode 100644 index 0000000000000000000000000000000000000000..4b6418baecebd23eec6598a4eb723dc1516263bd --- /dev/null +++ b/docstore/7153096d-eaa8-4019-867d-991fd3b7a4a6 @@ -0,0 +1 @@ +default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. \ No newline at end of file diff --git a/docstore/716aa1e8-763d-4986-af72-23cdea7487b9 b/docstore/716aa1e8-763d-4986-af72-23cdea7487b9 new file mode 100644 index 0000000000000000000000000000000000000000..40517314fd91c121847408df8a1f7fc600adf0b3 --- /dev/null +++ b/docstore/716aa1e8-763d-4986-af72-23cdea7487b9 @@ -0,0 +1 @@ +string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . 
The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property \ No newline at end of file diff --git a/docstore/71bcac5a-e156-4c94-b248-c48f05640ca3 b/docstore/71bcac5a-e156-4c94-b248-c48f05640ca3 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/71bcac5a-e156-4c94-b248-c48f05640ca3 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . 
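To make the version-name patterns concrete, here is a minimal sketch (using the Python Gen AI SDK shown elsewhere on these pages) that passes each kind of model string to generateContent. The specific version strings are the illustrative examples above and may have been superseded by newer releases.

from google import genai

client = genai.Client()

# The model is selected purely by the name string; the concrete versions below
# are the illustrative examples from this page, not necessarily the newest.
for model_name in [
    "gemini-2.0-flash",              # latest stable for a generation/variation
    "gemini-2.0-flash-001",          # a pinned stable version
    "gemini-2.5-pro-preview-06-05",  # a preview version
    "gemini-2.0-pro-exp-02-05",      # an experimental version
]:
    response = client.models.generate_content(
        model=model_name,
        contents="Say hello in one short sentence.",
    )
    print(model_name, "->", response.text)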
Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/71de11b6-9c94-4b79-ba12-1532cbe1730d b/docstore/71de11b6-9c94-4b79-ba12-1532cbe1730d new file mode 100644 index 0000000000000000000000000000000000000000..ebc8fdc5ad27fd96758924c177eadfccc4d6556f --- /dev/null +++ b/docstore/71de11b6-9c94-4b79-ba12-1532cbe1730d @@ -0,0 +1 @@ +Structured output | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Structured output You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes, standardize them to build a structured database. Gemini can generate either JSON or enum values as structured output. Generating JSON There are two ways to generate JSON using the Gemini API: Configure a schema on the model Provide a schema in a text prompt Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON. Configuring a schema (recommended) To constrain the model to generate JSON, configure a responseSchema . The model will then respond to any prompt with JSON-formatted output. Python from google import genai from pydantic import BaseModel class Recipe ( BaseModel ): recipe_name : str ingredients : list [ str ] client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "List a few popular cookie recipes, and include the amounts of ingredients." , config = { "response_mime_type" : "application/json" , "response_schema" : list [ Recipe ], }, ) # Use the response as a JSON string. print ( response . text ) # Use instantiated objects. my_recipes : list [ Recipe ] = response . parsed Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null. JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = \ No newline at end of file diff --git a/docstore/71dfe8ad-a891-456b-974c-553eb964cf97 b/docstore/71dfe8ad-a891-456b-974c-553eb964cf97 new file mode 100644 index 0000000000000000000000000000000000000000..03e316e434a13c2a6804dc8cff96f196f07c7e52 --- /dev/null +++ b/docstore/71dfe8ad-a891-456b-974c-553eb964cf97 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers. Here's how I'll approach this: 1. **Generate Prime Numbers:** I'll use an iterative method to find prime numbers. I'll start with 2 and check if each subsequent number is divisible by any number between 2 and its square root. If not, it's a prime. 2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of them. 3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list. 
Here's the Python code to do this: def is_prime(n): """Efficiently checks if a number is prime.""" if n <= 1: return False if n <= 3: return True if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: if n % i == 0 or n % (i + 2) == 0: return False i += 6 return True primes = [] num = 2 while len(primes) < 50: if is_prime(num): primes.append(num) num += 1 sum_of_primes = sum(primes) print(f'{primes=}') print(f'{sum_of_primes=}') primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229] sum_of_primes=5117 The sum of the first 50 prime numbers is 5117. This output combines several content parts that the model returns when using code execution: text : Inline text generated by the model executableCode : Code generated by the model that is meant to be executed codeExecutionResult : Result of the executable code The naming conventions for these parts vary by programming language. Use code execution in chat You can also use code execution as part of a chat. Python from google import genai from google.genai import types client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) response = chat . send_message ( "I have a math question for you." ) print ( response . text ) response = chat . send_message ( "What is \ No newline at end of file diff --git a/docstore/71f328af-b547-447b-abe4-ffe19d0ef006 b/docstore/71f328af-b547-447b-abe4-ffe19d0ef006 new file mode 100644 index 0000000000000000000000000000000000000000..9a3ae8e54d036eb9d08cf51953b4e3479c03ffae --- /dev/null +++ b/docstore/71f328af-b547-447b-abe4-ffe19d0ef006 @@ -0,0 +1 @@ +Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/722f61a1-f892-429e-a846-2bf6603f8c03 b/docstore/722f61a1-f892-429e-a846-2bf6603f8c03 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/722f61a1-f892-429e-a846-2bf6603f8c03 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. 
Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/725f2c46-6cab-4fd8-b997-94d69fbd137a b/docstore/725f2c46-6cab-4fd8-b997-94d69fbd137a new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/725f2c46-6cab-4fd8-b997-94d69fbd137a @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. 
Prompt: Answer the question \ No newline at end of file diff --git a/docstore/726a4431-d6a3-463e-83f9-f6b397167be7 b/docstore/726a4431-d6a3-463e-83f9-f6b397167be7 new file mode 100644 index 0000000000000000000000000000000000000000..83e8a7f39a569661ceb51609e03cd9ce9f516cda --- /dev/null +++ b/docstore/726a4431-d6a3-463e-83f9-f6b397167be7 @@ -0,0 +1 @@ +will only show up for projects that meet next tier qualifications . After a quick validation, the project will be upgraded to the next tier. Request a rate limit increase Each model variation has an associated rate limit (requests per minute, RPM). For details on those rate limits, see Gemini models . Request paid tier rate limit increase We offer no guarantees about increasing your rate limit, but we'll do our best to review your request and reach out to you if we're able to accommodate your capacity needs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/7273a98c-f712-491b-893b-cb80bf03e948 b/docstore/7273a98c-f712-491b-893b-cb80bf03e948 new file mode 100644 index 0000000000000000000000000000000000000000..18e5380dd4144398b3d4c6273920669cbc2b0130 --- /dev/null +++ b/docstore/7273a98c-f712-491b-893b-cb80bf03e948 @@ -0,0 +1 @@ +'gemini-2.0-flash' , contents = 'Tell me a story in 100 words.' , config = types . GenerateContentConfig ( system_instruction = 'you are a story teller for kids under 5 years old' , max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], seed = 42 , ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story about a magic backpack." , config : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York" ), & genai . GenerateContentConfig { Temperature : genai . Ptr [ float32 ]( 0.5 ), TopP : genai . Ptr [ float32 ]( 0.5 ), TopK : genai . Ptr [ float32 ]( 2.0 ), ResponseMIMEType : "application/json" , StopSequences : [] string { "Yankees" }, CandidateCount : 2 , Seed : genai . Ptr [ int32 ]( 42 ), MaxOutputTokens : 128 , PresencePenalty : genai . Ptr [ float32 ]( 0.5 ), FrequencyPenalty : genai . Ptr [ float32 ]( 0.5 ), }, ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing response Safety settings Generate a response with safety settings: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . 
generate_content ( 'say something bad' , safety_settings = { 'HATE' : 'BLOCK_ONLY_HIGH' , 'HARASSMENT' : 'BLOCK_ONLY_HIGH' , } ) JavaScript import { GoogleGenerativeAI , HarmCategory , HarmBlockThreshold } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI \ No newline at end of file diff --git a/docstore/7279ceb6-2db4-4adb-aac7-477745196e55 b/docstore/7279ceb6-2db4-4adb-aac7-477745196e55 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/7279ceb6-2db4-4adb-aac7-477745196e55 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/727a21a4-f2dc-47dc-a25d-ce33a8a947f1 b/docstore/727a21a4-f2dc-47dc-a25d-ce33a8a947f1 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/727a21a4-f2dc-47dc-a25d-ce33a8a947f1 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . 
text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/7284e0b4-41eb-4c43-a038-c723867d8b62 b/docstore/7284e0b4-41eb-4c43-a038-c723867d8b62 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd210d4957ccdbb55df147cb99efb49a9932b2 --- /dev/null +++ b/docstore/7284e0b4-41eb-4c43-a038-c723867d8b62 @@ -0,0 +1 @@ +tokens Context caching price Not available $0.3125, prompts <= 128k tokens $0.625, prompts > 128k tokens Context caching (storage) Not available $4.50 per hour Tuning price Not available Not available Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Text Embedding 004 Our state-of-the-art text embedding model. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Tuning price Not available Not available Used to improve our products Yes No [*] Google AI Studio usage is free of charge in all available regions . See Billing FAQs for details. [**] Prices may differ from the prices listed here and the prices offered on Vertex AI. For Vertex prices, see the Vertex AI pricing page . [***] If you are using dynamic retrieval to optimize costs, only requests that contain at least one grounding support URL from the web in their response are charged for Grounding with Google Search. Costs for Gemini always apply. Rate limits are subject to change. Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
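As a side note on the segmentation snippet above: the box_2d values it scales are normalized to the 0-1000 range in [y_min, x_min, y_max, x_max] order. The helper below is a minimal sketch (not part of the docs) that restates that conversion on its own so the arithmetic is easy to verify.

# Minimal sketch restating the coordinate conversion used in the segmentation
# example above: box_2d is [y_min, x_min, y_max, x_max], normalized to 0-1000.
def box_to_pixels(box_2d, image_width, image_height):
    y0 = int(box_2d[0] / 1000 * image_height)
    x0 = int(box_2d[1] / 1000 * image_width)
    y1 = int(box_2d[2] / 1000 * image_height)
    x1 = int(box_2d[3] / 1000 * image_width)
    return x0, y0, x1, y1

# A box covering the central quarter of an 800x600 image:
print(box_to_pixels([250, 250, 750, 750], 800, 600))  # (200, 150, 600, 450)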
\ No newline at end of file diff --git a/docstore/72b3006b-68d9-44aa-b9d8-1b868a05cb3d b/docstore/72b3006b-68d9-44aa-b9d8-1b868a05cb3d new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/72b3006b-68d9-44aa-b9d8-1b868a05cb3d @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/72d1a0e8-7a56-45e9-aefe-355d51547aca b/docstore/72d1a0e8-7a56-45e9-aefe-355d51547aca new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/72d1a0e8-7a56-45e9-aefe-355d51547aca @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. 
For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/72fb4705-4cb3-4bf7-ac8a-f15940e7b5f3 b/docstore/72fb4705-4cb3-4bf7-ac8a-f15940e7b5f3 new file mode 100644 index 0000000000000000000000000000000000000000..13dae0738dbe1cc658da4f8d80ffe7f33c50362c --- /dev/null +++ b/docstore/72fb4705-4cb3-4bf7-ac8a-f15940e7b5f3 @@ -0,0 +1 @@ +(Multimodal Live API) Models supported All Gemini 2.0 and 2.5 models Only Flash experimental models File input types supported .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts Plotting libraries supported Matplotlib, seaborn Matplotlib, seaborn Multi-tool use Yes (code execution + grounding only) Yes Billing There's no additional charge for enabling code execution from the Gemini API. You'll be billed at the current rate of input and output tokens based on the Gemini model you're using. Here are a few other things to know about billing for code execution: You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model. Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images. Code execution results are also counted as output tokens. The billing model is shown in the following diagram: You're billed at the current rate of input and output tokens based on the Gemini model you're using. If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled intermediate tokens and are billed as input tokens . Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as output tokens . The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt. Limitations The model can only generate and execute code. It can't return other artifacts like media files. In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story). There is some variation in the ability of the different models to use code execution successfully. 
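To tie the billing discussion back to the response structure, the following is a minimal sketch (Python SDK) of enabling code execution and reading back the text, executableCode, and codeExecutionResult parts described earlier. The part field names follow the Python Gen AI SDK, and the prompt is only an example.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=(
        "What is the sum of the first 50 prime numbers? "
        "Generate and run code for the calculation."
    ),
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
    ),
)

# A code-execution response typically interleaves inline text, the generated
# code, and the result of running that code as separate parts.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print("TEXT:", part.text)
    if part.executable_code is not None:
        print("CODE:", part.executable_code.code)
    if part.code_execution_result is not None:
        print("RESULT:", part.code_execution_result.output)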
Supported libraries The code execution \ No newline at end of file diff --git a/docstore/72ffbb78-cf12-4495-86e2-e3ed24e7f6a2 b/docstore/72ffbb78-cf12-4495-86e2-e3ed24e7f6a2 new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/72ffbb78-cf12-4495-86e2-e3ed24e7f6a2 @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a the 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/73120d89-847b-48db-ac01-8d6e044efcbb b/docstore/73120d89-847b-48db-ac01-8d6e044efcbb new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/73120d89-847b-48db-ac01-8d6e044efcbb @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/731e930c-f29c-45e4-93cb-fd4f6fa55c12 b/docstore/731e930c-f29c-45e4-93cb-fd4f6fa55c12 new file mode 100644 index 0000000000000000000000000000000000000000..9a406d6d652b4766d7f38fc77bc77aa7dbb4036f --- /dev/null +++ b/docstore/731e930c-f29c-45e4-93cb-fd4f6fa55c12 @@ -0,0 +1 @@ +Context caching | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Context caching Python JavaScript Go REST In a typical AI workflow, you might pass the same input tokens over and over to a model. The Gemini API offers two different caching mechanisms: Implicit caching (automatically enabled on Gemini 2.5 models, no cost saving guarantee) Explicit caching (can be manually enabled on most models, cost saving guarantee) Explicit caching is useful in cases where you want to guarantee cost savings, but with some added developer work. Implicit caching Implicit caching is enabled by default for all Gemini 2.5 models. We automatically pass on cost savings if your request hits caches. There is nothing you need to do in order to enable this. It is effective as of May 8th, 2025. The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. To increase the chance of an implicit cache hit: Try putting large and common contents at the beginning of your prompt Try to send requests with similar prefix in a short amount of time You can see the number of tokens which were cache hits in the response object's usage_metadata field. Explicit caching Using the Gemini API explicit caching feature, you can pass some content to the model once, cache the input tokens, and then refer to the cached tokens for subsequent requests. At certain volumes, using cached tokens is lower cost than passing in the same corpus of tokens repeatedly. When you cache a set of tokens, you can choose how long you want the cache to exist before the tokens are automatically deleted. This caching duration is called the time to live (TTL). 
If not set, \ No newline at end of file diff --git a/docstore/732dde51-1ec0-4252-b2db-2a6841e94851 b/docstore/732dde51-1ec0-4252-b2db-2a6841e94851 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/732dde51-1ec0-4252-b2db-2a6841e94851 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/7352bc60-2b51-431f-956c-af6206d5ca60 b/docstore/7352bc60-2b51-431f-956c-af6206d5ca60 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/7352bc60-2b51-431f-956c-af6206d5ca60 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . 
Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/736d8b4e-f54a-4597-b1d6-622f681376fd b/docstore/736d8b4e-f54a-4597-b1d6-622f681376fd new file mode 100644 index 0000000000000000000000000000000000000000..02741f017de0a4f2326222e7e2c63ce436f783ef --- /dev/null +++ b/docstore/736d8b4e-f54a-4597-b1d6-622f681376fd @@ -0,0 +1 @@ +unsafe prompt." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( response . Text ()) } JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const safetySettings = [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_LOW_AND_ABOVE" , }, { category : "HARM_CATEGORY_HATE_SPEECH" , threshold : "BLOCK_LOW_AND_ABOVE" , }, ]; async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Some potentially unsafe prompt." , config : { safetySettings : safetySettings , }, }); console . log ( response . text ); } await main (); Dart (Flutter) final safetySettings = [ SafetySetting ( HarmCategory . harassment , HarmBlockThreshold . low ), SafetySetting ( HarmCategory . hateSpeech , HarmBlockThreshold . low ), ]; final model = GenerativeModel ( model: 'gemini-1.5-flash' , apiKey: apiKey , safetySettings: safetySettings , ); Kotlin val harassmentSafety = SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ) val hateSpeechSafety = SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ) val generativeModel = GenerativeModel ( modelName = "gemini-1.5-flash" , apiKey = BuildConfig . apiKey , safetySettings = listOf ( harassmentSafety , hateSpeechSafety ) ) Java SafetySetting harassmentSafety = new SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ); SafetySetting hateSpeechSafety = new SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ); GenerativeModel gm = new GenerativeModel ( "gemini-1.5-flash" , BuildConfig . apiKey , null , // generation config is optional Arrays . asList ( harassmentSafety , hateSpeechSafety ) ); GenerativeModelFutures model = GenerativeModelFutures . 
from ( gm ); REST echo '{ "safetySettings": [ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": \ No newline at end of file diff --git a/docstore/737d68cf-889a-47e2-99a9-25ee6ea73ed9 b/docstore/737d68cf-889a-47e2-99a9-25ee6ea73ed9 new file mode 100644 index 0000000000000000000000000000000000000000..fa9f25d212859f977afd73b3d8fec1e0a96d893a --- /dev/null +++ b/docstore/737d68cf-889a-47e2-99a9-25ee6ea73ed9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate#main-content Title: Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/73967c23-ab92-4e2a-90e0-ed226b47dd19 b/docstore/73967c23-ab92-4e2a-90e0-ed226b47dd19 new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/73967c23-ab92-4e2a-90e0-ed226b47dd19 @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . 
\ No newline at end of file diff --git a/docstore/73b6fc95-171c-46bb-a071-8eaac52304a8 b/docstore/73b6fc95-171c-46bb-a071-8eaac52304a8 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/73b6fc95-171c-46bb-a071-8eaac52304a8 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/73c02a2d-05c1-4485-bc39-b96008bd3a41 b/docstore/73c02a2d-05c1-4485-bc39-b96008bd3a41 new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/73c02a2d-05c1-4485-bc39-b96008bd3a41 @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . 
WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/73c5abd2-87a2-4409-8f20-2880bd170e31 b/docstore/73c5abd2-87a2-4409-8f20-2880bd170e31 new file mode 100644 index 0000000000000000000000000000000000000000..5a0ee33e48eac2dd88a2e9ea3021e166f0794c9f --- /dev/null +++ b/docstore/73c5abd2-87a2-4409-8f20-2880bd170e31 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/downloads#install Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/73d0db92-ffd7-4ba8-a752-77231cc234a6 b/docstore/73d0db92-ffd7-4ba8-a752-77231cc234a6 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/73d0db92-ffd7-4ba8-a752-77231cc234a6 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/73dd6644-239b-4754-ba3f-51801ae64b4b b/docstore/73dd6644-239b-4754-ba3f-51801ae64b4b new file mode 100644 index 0000000000000000000000000000000000000000..651124da3927c31f504e3c50b4f98f32fbef29df --- /dev/null +++ b/docstore/73dd6644-239b-4754-ba3f-51801ae64b4b @@ -0,0 +1 @@ +chat . send_message ( message = 'What happened after that?' ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const chat = ai . chats . create ({ model : "gemini-2.0-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } chat , err := client . Chats . Create ( ctx , "gemini-2.0-flash" , nil , nil ) if err != nil { log . Fatal ( err ) } result , err := chat . SendMessage ( ctx , genai . Part { Text : "Hello, I have 2 dogs in my house." }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result result , err = chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Function calling Before Python import google.generativeai as genai from enum import Enum def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) response = model . generate_content ( "What is the weather in San Francisco?" ) function_call = response . candidates [ 0 ] . parts [ 0 ] . function_call After Python In the new SDK, automatic function calling is the \ No newline at end of file diff --git a/docstore/73e0d13a-e165-47cb-ae69-b25d939706e3 b/docstore/73e0d13a-e165-47cb-ae69-b25d939706e3 new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/73e0d13a-e165-47cb-ae69-b25d939706e3 @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. 
Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/73ea86ec-b9a2-4a79-9508-6268829054f9 b/docstore/73ea86ec-b9a2-4a79-9508-6268829054f9 new file mode 100644 index 0000000000000000000000000000000000000000..e5d5e1dbb23842698592b22c9a1c04ff33c9bd0d --- /dev/null +++ b/docstore/73ea86ec-b9a2-4a79-9508-6268829054f9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/files Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/73ff35f6-579d-48d0-b456-037fb148f58a b/docstore/73ff35f6-579d-48d0-b456-037fb148f58a new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/73ff35f6-579d-48d0-b456-037fb148f58a @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . 
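The hard-task guidance earlier in this passage recommends a high thinking budget but shows no code. As a rough illustration, the budget can be raised through the same thinking_config field used in the object-detection example later in this document; the specific budget value and model name below are illustrative assumptions.

Python
# Sketch: raising the thinking budget for a hard reasoning task.
# ThinkingConfig / thinking_budget mirror the usage shown elsewhere in this
# document; the value 8192 is only an illustration.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Find the sum of all integer bases b > 9 for which 17_b is a divisor of 97_b.",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=8192)  # larger budget for harder tasks
    ),
)
print(response.text)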
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/7404e0a9-f4be-4e1a-b7d0-b4faff8de6df b/docstore/7404e0a9-f4be-4e1a-b7d0-b4faff8de6df new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/7404e0a9-f4be-4e1a-b7d0-b4faff8de6df @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. 
Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/746fbecf-4d38-43ef-a968-a47bdf113b92 b/docstore/746fbecf-4d38-43ef-a968-a47bdf113b92 new file mode 100644 index 0000000000000000000000000000000000000000..b3339b694e68c4d7176324567b6e6d7542786980 --- /dev/null +++ b/docstore/746fbecf-4d38-43ef-a968-a47bdf113b92 @@ -0,0 +1 @@ +YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request. You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video \ No newline at end of file diff --git a/docstore/7492829e-0b6a-4d71-aad5-6b849074beec b/docstore/7492829e-0b6a-4d71-aad5-6b849074beec new file mode 100644 index 0000000000000000000000000000000000000000..e83a53ec98aba4f3d8e71a5d4814fb0a8b6f8f57 --- /dev/null +++ b/docstore/7492829e-0b6a-4d71-aad5-6b849074beec @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/749fc638-3b5b-4ec2-ae3d-12be9f10f53e b/docstore/749fc638-3b5b-4ec2-ae3d-12be9f10f53e new file mode 100644 index 0000000000000000000000000000000000000000..f0ec9c1b62dbb7bddde3f187dd37839fdeb18bbb --- /dev/null +++ b/docstore/749fc638-3b5b-4ec2-ae3d-12be9f10f53e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#supported-models Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/74b8ba64-6b71-43d1-ac17-512efcbf7869 b/docstore/74b8ba64-6b71-43d1-ac17-512efcbf7869 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/74b8ba64-6b71-43d1-ac17-512efcbf7869 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . 
sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/74cdfad7-6f31-4ea4-99dd-17e71e3f5141 b/docstore/74cdfad7-6f31-4ea4-99dd-17e71e3f5141 new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/74cdfad7-6f31-4ea4-99dd-17e71e3f5141 @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/74e77391-9c49-473b-ba7b-caefe9736fcb b/docstore/74e77391-9c49-473b-ba7b-caefe9736fcb new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/74e77391-9c49-473b-ba7b-caefe9736fcb @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . 
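The aspect-ratio guidance above is prose only. The sketch below shows how an aspect ratio is commonly passed to Veo through the google-genai SDK; generate_videos, GenerateVideosConfig, the polling loop, and the download calls are assumptions based on the public Veo reference rather than this text.

Python
# Sketch (assumed API surface): request a 9:16 portrait video from Veo 2 and
# poll the long-running operation until the video is ready.
import time

from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A majestic Hawaiian waterfall in a lush rainforest, smooth cinematic camera movement.",
    config=types.GenerateVideosConfig(aspect_ratio="9:16"),  # use "16:9" for widescreen
)

# Video generation is asynchronous; poll until the operation completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video{n}.mp4")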
writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/74ecccab-8d4a-4722-b300-6edefb8a7cb4 b/docstore/74ecccab-8d4a-4722-b300-6edefb8a7cb4 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/74ecccab-8d4a-4722-b300-6edefb8a7cb4 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/74f54855-b742-4e74-bc16-4adbbbbecf26 b/docstore/74f54855-b742-4e74-bc16-4adbbbbecf26 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/74f54855-b742-4e74-bc16-4adbbbbecf26 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" 
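The audio-format requirements described above (raw little-endian 16-bit PCM, 16 kHz input, audio/pcm;rate=16000 MIME type) are stated in prose only. One common way to prepare an arbitrary audio file is sketched below; the librosa and soundfile dependencies are assumptions, as neither library is mentioned in the surrounding text.

Python
# Sketch (assumed helper libraries): convert an arbitrary audio file to
# 16 kHz, 16-bit, mono PCM bytes suitable for audio/pcm;rate=16000 input.
import io

import librosa          # assumed dependency for loading/resampling
import soundfile as sf  # assumed dependency for raw PCM encoding

def to_pcm16(path: str, rate: int = 16000) -> bytes:
    samples, _ = librosa.load(path, sr=rate, mono=True)  # resample and downmix
    buffer = io.BytesIO()
    sf.write(buffer, samples, rate, format="RAW", subtype="PCM_16")
    buffer.seek(0)
    return buffer.read()

# The returned bytes can be wrapped in
# types.Blob(data=..., mime_type="audio/pcm;rate=16000"),
# exactly as in the input-transcription example earlier in this document.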
using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/74fc1e9a-b838-4d2b-9a98-dcd6bfaab174 b/docstore/74fc1e9a-b838-4d2b-9a98-dcd6bfaab174 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/74fc1e9a-b838-4d2b-9a98-dcd6bfaab174 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. 
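The usage section above describes the Files API limits, while the nearby snippets only show listing and deleting files. A minimal upload-and-prompt sketch is given below; the file path and prompt text are illustrative.

Python
# Sketch: upload a media file once, then reference the returned file object in a prompt.
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")  # illustrative path

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, "Describe this audio clip in one sentence."],
)
print(response.text)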
This \ No newline at end of file diff --git a/docstore/750d649b-616d-4550-b62f-876e8ad92994 b/docstore/750d649b-616d-4550-b62f-876e8ad92994 new file mode 100644 index 0000000000000000000000000000000000000000..81bd025c83c281ffc71cf5faff30eb0cffa79ae1 --- /dev/null +++ b/docstore/750d649b-616d-4550-b62f-876e8ad92994 @@ -0,0 +1 @@ +Dialog 1 25,000 50 Tier 2 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 1000 10,000,000 -- Gemini 2.0 Flash Live 1000 10,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 100 1,000,000 Unlimited Gemini 2.5 Flash Experimental Native Audio Thinking Dialog -- -- -- Tier 3 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 1000 10,000,000 -- Gemini 2.0 Flash Live 1000 10,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog -- -- -- Gemini 2.5 Flash Experimental Native Audio Thinking Dialog -- -- -- Specified rate limits are not guaranteed and actual capacity may vary. Batch Mode rate limits Batch Mode requests are subject to their own rate limits, separate from the non-batch mode API calls. Concurrent batch requests: 100 Input file size limit: 2GB File storage limit: 20GB Enqueued tokens per model: The following table outlines the maximum number of tokens that can be enqueued for batch processing across all your active batch jobs for a given model. Tier 1 Model Enqueued Tokens Limit Gemini 2.5 Pro 5,000,000 Gemini 2.5 Flash 3,000,000 Gemini 2.0 Flash 10,000,000 Gemini 2.0 Flash-Lite 10,000,000 Tier 2 Model Enqueued Tokens Limit Gemini 2.5 Pro 500,000,000 Gemini 2.5 Flash 400,000,000 Gemini 2.0 Flash 1,000,000,000 Gemini 2.0 Flash-Lite 1,000,000,000 Tier 3 Model Enqueued Tokens Limit Gemini 2.5 Pro 1,000,000,000 Gemini 2.5 Flash 1,000,000,000 Gemini 2.0 Flash 5,000,000,000 Gemini 2.0 Flash-Lite 5,000,000,000 Specified rate limits are not guaranteed and actual capacity may vary. How to upgrade to the next tier The Gemini API uses Cloud Billing for all billing services. To transition from the Free tier to a paid tier, you must first enable Cloud Billing for your Google Cloud project. Once your project meets the specified criteria, it becomes eligible for an upgrade to the next tier. To request an upgrade, follow these steps: Navigate to the API keys page in AI Studio. Locate the project you want to upgrade and click "Upgrade". The "Upgrade" option \ No newline at end of file diff --git a/docstore/7529e7f1-47c0-4c0f-9dc9-fdefe0d1ef73 b/docstore/7529e7f1-47c0-4c0f-9dc9-fdefe0d1ef73 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/7529e7f1-47c0-4c0f-9dc9-fdefe0d1ef73 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/75417cfa-0a12-44e9-8a93-904621b8b2bc b/docstore/75417cfa-0a12-44e9-8a93-904621b8b2bc new file mode 100644 index 0000000000000000000000000000000000000000..651124da3927c31f504e3c50b4f98f32fbef29df --- /dev/null +++ b/docstore/75417cfa-0a12-44e9-8a93-904621b8b2bc @@ -0,0 +1 @@ +chat . send_message ( message = 'What happened after that?' ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const chat = ai . chats . create ({ model : "gemini-2.0-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } chat , err := client . Chats . Create ( ctx , "gemini-2.0-flash" , nil , nil ) if err != nil { log . Fatal ( err ) } result , err := chat . SendMessage ( ctx , genai . Part { Text : "Hello, I have 2 dogs in my house." }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result result , err = chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Function calling Before Python import google.generativeai as genai from enum import Enum def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) response = model . generate_content ( "What is the weather in San Francisco?" ) function_call = response . candidates [ 0 ] . parts [ 0 ] . 
function_call After Python In the new SDK, automatic function calling is the \ No newline at end of file diff --git a/docstore/7596b4e9-b274-4701-85c4-c8e1f09606a5 b/docstore/7596b4e9-b274-4701-85c4-c8e1f09606a5 new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/7596b4e9-b274-4701-85c4-c8e1f09606a5 @@ -0,0 +1 @@ +voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' 
}] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/75a0e477-947a-4374-9ff5-403b88b3b230 b/docstore/75a0e477-947a-4374-9ff5-403b88b3b230 new file mode 100644 index 0000000000000000000000000000000000000000..c1b2c9ef7687008f62e5746fd5d26390cb5ee722 --- /dev/null +++ b/docstore/75a0e477-947a-4374-9ff5-403b88b3b230 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/75a742de-9205-4705-b752-51bb1cb4bcec b/docstore/75a742de-9205-4705-b752-51bb1cb4bcec new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/75a742de-9205-4705-b752-51bb1cb4bcec @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/75b7ee60-d404-4394-85ba-e7681bee2795 b/docstore/75b7ee60-d404-4394-85ba-e7681bee2795 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/75b7ee60-d404-4394-85ba-e7681bee2795 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? 
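The Step 4 snippet above is truncated. The sketch below illustrates the rule it describes: return the model's own content (which carries the thought_signature parts) together with the function result before calling the model again. The client, config, contents, and response objects are the ones from the earlier steps of that example; the weather result value is an illustrative placeholder.

Python
# Sketch: return the function result along with the model content that carries
# the thought signatures, so the model can restore its reasoning context.
# 'client', 'config', 'contents', and 'response' come from the earlier steps.
from google.genai import types

function_call = response.candidates[0].content.parts[0].function_call
result = {"temperature": 14, "unit": "C"}  # placeholder output of your real function

function_response_part = types.Part.from_function_response(
    name=function_call.name,
    response={"result": result},
)

# Append the model's content unchanged (it contains the thought_signature parts),
# then the function response, and call the model again.
contents.append(response.candidates[0].content)
contents.append(types.Content(role="user", parts=[function_response_part]))

final_response = client.models.generate_content(
    model="gemini-2.5-flash",
    config=config,
    contents=contents,
)
print(final_response.text)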
""" tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/75ccd79d-09a2-4c29-8d8d-76deb42c0919 b/docstore/75ccd79d-09a2-4c29-8d8d-76deb42c0919 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/75ccd79d-09a2-4c29-8d8d-76deb42c0919 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/75daba6e-f1b4-4a4e-894a-c229ff90c293 b/docstore/75daba6e-f1b4-4a4e-894a-c229ff90c293 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/75daba6e-f1b4-4a4e-894a-c229ff90c293 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/75de3d72-0667-48fe-80c2-7f7d3778c659 b/docstore/75de3d72-0667-48fe-80c2-7f7d3778c659 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/75de3d72-0667-48fe-80c2-7f7d3778c659 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/75eb8830-76aa-458b-8ce2-2c7d1fc097e8 b/docstore/75eb8830-76aa-458b-8ce2-2c7d1fc097e8 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/75eb8830-76aa-458b-8ce2-2c7d1fc097e8 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . 
text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/75fa153f-11de-4fe6-b6d0-c0ad2a0e529b b/docstore/75fa153f-11de-4fe6-b6d0-c0ad2a0e529b new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/75fa153f-11de-4fe6-b6d0-c0ad2a0e529b @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/7627c5c4-5039-4cc9-ac02-38b6214d9505 b/docstore/7627c5c4-5039-4cc9-ac02-38b6214d9505 new file mode 100644 index 0000000000000000000000000000000000000000..0cc2718787a14b981d08f84859160815c71fc626 --- /dev/null +++ b/docstore/7627c5c4-5039-4cc9-ac02-38b6214d9505 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7632abc6-f709-45f1-b0c7-dfc3650621d8 b/docstore/7632abc6-f709-45f1-b0c7-dfc3650621d8 new file mode 100644 index 0000000000000000000000000000000000000000..ffa55cd17dc266b0e00c821779e2850dd473d215 --- /dev/null +++ b/docstore/7632abc6-f709-45f1-b0c7-dfc3650621d8 @@ -0,0 +1 @@ +"Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . 
error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/7634daf1-e912-4f89-b5a4-b008454cfc1f b/docstore/7634daf1-e912-4f89-b5a4-b008454cfc1f new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/7634daf1-e912-4f89-b5a4-b008454cfc1f @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/76574108-efdc-48d9-92da-54d3e0992d79 b/docstore/76574108-efdc-48d9-92da-54d3e0992d79 new file mode 100644 index 0000000000000000000000000000000000000000..a738473ae04afa5765dcb62bc90b497203b4a869 --- /dev/null +++ b/docstore/76574108-efdc-48d9-92da-54d3e0992d79 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/url-context Title: URL context | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7666f0a7-420d-4a8e-aad2-9d66dd12f8dc b/docstore/7666f0a7-420d-4a8e-aad2-9d66dd12f8dc new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/7666f0a7-420d-4a8e-aad2-9d66dd12f8dc @@ -0,0 +1 @@ +Modality . 
AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/766d2a1a-86ea-477c-80c6-4d36a2011944 b/docstore/766d2a1a-86ea-477c-80c6-4d36a2011944 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/766d2a1a-86ea-477c-80c6-4d36a2011944 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . 
These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/76dad70a-df79-4b08-923f-5f0558261f0d b/docstore/76dad70a-df79-4b08-923f-5f0558261f0d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/76dad70a-df79-4b08-923f-5f0558261f0d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/76fe05e7-31c4-43cc-b7dd-72f284ec3200 b/docstore/76fe05e7-31c4-43cc-b7dd-72f284ec3200 new file mode 100644 index 0000000000000000000000000000000000000000..5389b5d9d1b7115f0b4483b4e1da807b20a5cfd5 --- /dev/null +++ b/docstore/76fe05e7-31c4-43cc-b7dd-72f284ec3200 @@ -0,0 +1 @@ +such as object . 
properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake \ No newline at end of file diff --git a/docstore/77020056-79e3-4d35-b769-a303d5c7c69e b/docstore/77020056-79e3-4d35-b769-a303d5c7c69e new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/77020056-79e3-4d35-b769-a303d5c7c69e @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/7716115f-6f17-471e-9f4d-8d33753ceb24 b/docstore/7716115f-6f17-471e-9f4d-8d33753ceb24 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/7716115f-6f17-471e-9f4d-8d33753ceb24 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/7727318e-06ae-4359-ad07-3bb9a78ee789 b/docstore/7727318e-06ae-4359-ad07-3bb9a78ee789 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/7727318e-06ae-4359-ad07-3bb9a78ee789 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/7736fdb8-f31a-4911-b54a-2ad422705505 b/docstore/7736fdb8-f31a-4911-b54a-2ad422705505 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/7736fdb8-f31a-4911-b54a-2ad422705505 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/77632fb4-81c5-46fe-9557-dda89e19651e b/docstore/77632fb4-81c5-46fe-9557-dda89e19651e new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/77632fb4-81c5-46fe-9557-dda89e19651e @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . 
LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/776839ac-4b6d-4eed-b5ea-c4134e3c3946 b/docstore/776839ac-4b6d-4eed-b5ea-c4134e3c3946 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/776839ac-4b6d-4eed-b5ea-c4134e3c3946 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/778792de-6b01-4438-b1c6-f91047786da3 b/docstore/778792de-6b01-4438-b1c6-f91047786da3 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/778792de-6b01-4438-b1c6-f91047786da3 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
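Calling the Flash-Lite preview uses the same generate_content pattern as the other text models; a minimal sketch using the preview model code listed in the details below:
Python
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment
response = client.models.generate_content(
    model="gemini-2.5-flash-lite-preview-06-17",
    contents="Summarize the benefits of a low-latency model in one sentence.",
)
print(response.text)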
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/77898192-26e7-4247-b054-38ce3e5c126b b/docstore/77898192-26e7-4247-b054-38ce3e5c126b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/77898192-26e7-4247-b054-38ce3e5c126b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/77912a0a-ea8e-4139-bca2-27d55a6564ab b/docstore/77912a0a-ea8e-4139-bca2-27d55a6564ab new file mode 100644 index 0000000000000000000000000000000000000000..66c7a16418e8d8187da2dc0d9183ed5aa54d5fe4 --- /dev/null +++ b/docstore/77912a0a-ea8e-4139-bca2-27d55a6564ab @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#main-content Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/77cec5d9-1391-4fb5-9ae3-b9f0dbf2d414 b/docstore/77cec5d9-1391-4fb5-9ae3-b9f0dbf2d414 new file mode 100644 index 0000000000000000000000000000000000000000..d50d50f040511131a80be9882c48fc298ef214ff --- /dev/null +++ b/docstore/77cec5d9-1391-4fb5-9ae3-b9f0dbf2d414 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/structured-output#main-content Title: Structured output | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/77dfd4b5-d7db-442b-88a2-f160724aa7f4 b/docstore/77dfd4b5-d7db-442b-88a2-f160724aa7f4 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/77dfd4b5-d7db-442b-88a2-f160724aa7f4 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
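For orientation, generating speech with this model follows the usual generate_content pattern with an AUDIO response modality and a speech_config. This is a minimal sketch, not a verbatim doc sample: the Kore prebuilt voice and the 24 kHz, 16-bit mono PCM output format are assumptions drawn from the speech generation guide, so verify them against the current docs.
Python
import wave
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# Assumption: the model returns raw 16-bit PCM at 24 kHz; wrap it in a WAV container.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)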
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/77eda493-38e5-48a3-81e4-d02ce3216f95 b/docstore/77eda493-38e5-48a3-81e4-d02ce3216f95 new file mode 100644 index 0000000000000000000000000000000000000000..c7d99b48acdb29ebe1cdd75df52d7215dd4d0ab1 --- /dev/null +++ b/docstore/77eda493-38e5-48a3-81e4-d02ce3216f95 @@ -0,0 +1 @@ +- Zsh Zsh is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.zshrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use bash : touch ~/.zshrc open ~/.zshrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.zshrc Windows Search for "Environment Variables" in the system settings Edit either "User variables" (for current user) or "System variables" (for all users - use with caution). Create the variable and add export GEMINI_API_KEY=your_key_here Apply the changes Providing API key explicitly In some cases, you may want to explicitly provide an API key. For example: You're doing a simple API call and prefer hard coding the API key. You want explicit control without having to rely on automatic discovery of environment variables by the Gemini API libraries You're using an environment where environment variables are not supported (e.g web) or you are making REST calls. Below are examples for how you can provide an API key explicitly: Python from google import genai client = genai . Client ( api_key = " YOUR_API_KEY " ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : " YOUR_API_KEY " }); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . \ No newline at end of file diff --git a/docstore/7836bfdd-ef10-4a3f-9673-44f413ceb227 b/docstore/7836bfdd-ef10-4a3f-9673-44f413ceb227 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/7836bfdd-ef10-4a3f-9673-44f413ceb227 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/7843bb5d-4827-4f7c-9954-5dddd022e316 b/docstore/7843bb5d-4827-4f7c-9954-5dddd022e316 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/7843bb5d-4827-4f7c-9954-5dddd022e316 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/785acfaf-2a12-4805-9d4b-a182422a634a b/docstore/785acfaf-2a12-4805-9d4b-a182422a634a new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/785acfaf-2a12-4805-9d4b-a182422a634a @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/785f87c9-c0c0-4f66-875a-61c5ce8afc63 b/docstore/785f87c9-c0c0-4f66-875a-61c5ce8afc63 new file mode 100644 index 0000000000000000000000000000000000000000..b0571e28c8e74f7e3e23139b08c0865b24edbd38 --- /dev/null +++ b/docstore/785f87c9-c0c0-4f66-875a-61c5ce8afc63 @@ -0,0 +1 @@ +And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python . 
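If you prefer to define the structure by hand rather than with Pydantic, the same enum-plus-object pattern can be passed directly as a Schema-style dictionary. The sketch below is illustrative rather than a verbatim doc sample; propertyOrdering is the extra field mentioned above, and the remaining keys mirror the Schema fields described next.
Python
from google import genai

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Suggest one home-baked cookie recipe and grade it.",
    config={
        "response_mime_type": "application/json",
        "response_schema": {
            "type": "OBJECT",
            "properties": {
                "recipe_name": {"type": "STRING"},
                "rating": {
                    "type": "STRING",
                    "enum": ["a+", "a", "b", "c", "d", "f"],
                },
            },
            "required": ["recipe_name", "rating"],
            # Optional; if your SDK version rejects the camelCase key, try property_ordering.
            "propertyOrdering": ["recipe_name", "rating"],
        },
    },
)
print(response.text)  # e.g. {"recipe_name": "...", "rating": "a"}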
Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": \ No newline at end of file diff --git a/docstore/7878e20a-1fbe-4f95-aba8-a3c74a80491a b/docstore/7878e20a-1fbe-4f95-aba8-a3c74a80491a new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/7878e20a-1fbe-4f95-aba8-a3c74a80491a @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/788506fc-7ecd-435c-aba7-e439cef8ca9f b/docstore/788506fc-7ecd-435c-aba7-e439cef8ca9f new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/788506fc-7ecd-435c-aba7-e439cef8ca9f @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . 
get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/78866ec2-1b54-4583-9fa0-2507ab007e9c b/docstore/78866ec2-1b54-4583-9fa0-2507ab007e9c new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/78866ec2-1b54-4583-9fa0-2507ab007e9c @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/78cb9dd9-2fd0-431e-90f5-c1556e9073a8 b/docstore/78cb9dd9-2fd0-431e-90f5-c1556e9073a8 new file mode 100644 index 0000000000000000000000000000000000000000..97aa001e09ba70bb61a006545e9c3b23aa75a8fa --- /dev/null +++ b/docstore/78cb9dd9-2fd0-431e-90f5-c1556e9073a8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/78d68f7c-6b00-4ca0-bb05-22f017b666f9 b/docstore/78d68f7c-6b00-4ca0-bb05-22f017b666f9 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/78d68f7c-6b00-4ca0-bb05-22f017b666f9 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. 
Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/78e53d10-158d-4909-b31b-6b446dd82923 b/docstore/78e53d10-158d-4909-b31b-6b446dd82923 new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/78e53d10-158d-4909-b31b-6b446dd82923 @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/79099489-fce3-46aa-8723-6ad2f28d7581 b/docstore/79099489-fce3-46aa-8723-6ad2f28d7581 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/79099489-fce3-46aa-8723-6ad2f28d7581 @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . 
text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/7917b535-77c5-4574-910f-2f9751d19153 b/docstore/7917b535-77c5-4574-910f-2f9751d19153 new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/7917b535-77c5-4574-910f-2f9751d19153 @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. 
It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/791b8c83-99f0-4b27-9e80-dec5c8b7a008 b/docstore/791b8c83-99f0-4b27-9e80-dec5c8b7a008 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/791b8c83-99f0-4b27-9e80-dec5c8b7a008 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/7930cbae-0aaf-435a-8d95-17948adae109 b/docstore/7930cbae-0aaf-435a-8d95-17948adae109 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/7930cbae-0aaf-435a-8d95-17948adae109 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/79310985-f861-4b7d-b9cc-38b674a524ec b/docstore/79310985-f861-4b7d-b9cc-38b674a524ec new file mode 100644 index 0000000000000000000000000000000000000000..dfeae8fcf584330ed11cdd48e07105d5f4f56b31 --- /dev/null +++ b/docstore/79310985-f861-4b7d-b9cc-38b674a524ec @@ -0,0 +1 @@ +retrieval_tool ] ) response = client . models . generate_content ( model = 'gemini-1.5-flash' , contents = "Who won the euro 2024?" , config = config , ) print ( response . text ) if not response . candidates [ 0 ] . grounding_metadata : print ( " \n Model answered from its own knowledge." ) JavaScript // Note: This is a legacy approach for Gemini 1.5 models. // The 'googleSearch' tool is recommended for all new development. import { GoogleGenAI , DynamicRetrievalConfigMode } from "@google/genai" ; const ai = new GoogleGenAI ({}); const retrievalTool = { googleSearchRetrieval : { dynamicRetrievalConfig : { mode : DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamicThreshold : 0.7 , // Only search if confidence > 70% }, }, }; const config = { tools : [ retrievalTool ], }; const response = await ai . models . generateContent ({ model : "gemini-1.5-flash" , contents : "Who won the euro 2024?" , config , }); console . log ( response . text ); if ( ! response . candidates ? .[ 0 ] ? . groundingMetadata ) { console . log ( "\nModel answered from its own knowledge." 
); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ {"parts": [{"text": "Who won the euro 2024?"}]} ], "tools": [{ "google_search_retrieval": { "dynamic_retrieval_config": { "mode": "MODE_DYNAMIC", "dynamic_threshold": 0.7 } } }] }' What's next Try the Grounding with Google Search in the Gemini API Cookbook . Learn about other available tools, like Function Calling . Learn how to augment prompts with specific URLs using the URL context tool . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. \ No newline at end of file diff --git a/docstore/793b70cc-4da0-41b9-8675-3705ee95109e b/docstore/793b70cc-4da0-41b9-8675-3705ee95109e new file mode 100644 index 0000000000000000000000000000000000000000..b02538f85c1e26824fb9d15e124ac354f46dfed1 --- /dev/null +++ b/docstore/793b70cc-4da0-41b9-8675-3705ee95109e @@ -0,0 +1 @@ +temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console .
log ( "Tool \ No newline at end of file diff --git a/docstore/79506e48-c6fc-4b4e-b5b2-cc790b68e0f8 b/docstore/79506e48-c6fc-4b4e-b5b2-cc790b68e0f8 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/79506e48-c6fc-4b4e-b5b2-cc790b68e0f8 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/79695667-baf2-4afb-9f6a-9a41281f5b5c b/docstore/79695667-baf2-4afb-9f6a-9a41281f5b5c new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/79695667-baf2-4afb-9f6a-9a41281f5b5c @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. 
Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/7969bcdb-38ae-4673-8c54-6917def9baab b/docstore/7969bcdb-38ae-4673-8c54-6917def9baab new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/7969bcdb-38ae-4673-8c54-6917def9baab @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/798d6f7e-ae04-47f2-be81-3005dcbb1e07 b/docstore/798d6f7e-ae04-47f2-be81-3005dcbb1e07 new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/798d6f7e-ae04-47f2-be81-3005dcbb1e07 @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. 
Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. Start by \ No newline at end of file diff --git a/docstore/79f692fd-3b38-461f-baed-8af00b75c407 b/docstore/79f692fd-3b38-461f-baed-8af00b75c407 new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/79f692fd-3b38-461f-baed-8af00b75c407 @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . 
from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/7a0cba4b-7434-4626-97ef-49797f989f90 b/docstore/7a0cba4b-7434-4626-97ef-49797f989f90 new file mode 100644 index 0000000000000000000000000000000000000000..01ae62c8740ecd40460af64187bff6feef8cdae5 --- /dev/null +++ b/docstore/7a0cba4b-7434-4626-97ef-49797f989f90 @@ -0,0 +1 @@ +are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : \ No newline at end of file diff --git a/docstore/7a0f2b8f-5bfe-4032-bb2f-780bee616245 b/docstore/7a0f2b8f-5bfe-4032-bb2f-780bee616245 new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/7a0f2b8f-5bfe-4032-bb2f-780bee616245 @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . 
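To make these prompt-writing tips concrete, the following Python sketch submits a descriptive prompt to Veo and polls the resulting long-running operation. It assumes the google-genai SDK's video-generation flow; the model name, polling interval, and output filename are illustrative.
Python
import time
from google import genai

client = genai.Client()

# A descriptive prompt following the tips above: subject, context, style, camera motion.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Close up shot of melting icicles on a frozen rock wall with cool blue tones, "
           "zoomed in, maintaining close-up detail of water drips",
)

# Video generation is asynchronous; poll the operation until it completes.
while not operation.done:
    time.sleep(10)
    operation = client.operations.get(operation)

# Download and save the first generated video.
video = operation.response.generated_videos[0]
client.files.download(file=video.video)
video.video.save("icicles.mp4")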
Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/7a31cfc5-be9b-4eb6-8249-0f4c33278537 b/docstore/7a31cfc5-be9b-4eb6-8249-0f4c33278537 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/7a31cfc5-be9b-4eb6-8249-0f4c33278537 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . 
Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/7a3e6b83-cc13-46f9-ac96-375bc0bb6aef b/docstore/7a3e6b83-cc13-46f9-ac96-375bc0bb6aef new file mode 100644 index 0000000000000000000000000000000000000000..b1044b06e974ef70df5275060bd78c27b49af935 --- /dev/null +++ b/docstore/7a3e6b83-cc13-46f9-ac96-375bc0bb6aef @@ -0,0 +1 @@ +ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the \ No newline at end of file diff --git a/docstore/7a4d4d1c-867e-4821-9a36-54a12ab98fd8 b/docstore/7a4d4d1c-867e-4821-9a36-54a12ab98fd8 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/7a4d4d1c-867e-4821-9a36-54a12ab98fd8 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/7a737b4c-b7dd-4a41-906a-da26e135741b b/docstore/7a737b4c-b7dd-4a41-906a-da26e135741b new file mode 100644 index 0000000000000000000000000000000000000000..1928fbda4690570381db2fc0734d5c40f27390c8 --- /dev/null +++ b/docstore/7a737b4c-b7dd-4a41-906a-da26e135741b @@ -0,0 +1 @@ +Part { InlineData : & genai . Blob { MIMEType : "audio/mp3" , Data : audioBytes , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } A few things to keep in mind about inline audio data: The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the total request size exceed 20 MB, then use the Files API to upload an audio file for use in the request. If you're using an audio sample multiple times, it's more efficient to upload an audio file . Get a transcript To get a transcript of audio data, just ask for it in the prompt: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) prompt = 'Generate a transcript of the speech.' response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ prompt , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const result = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Generate a transcript of the speech." , ]), }); console . log ( "result.text=" , result . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Generate a transcript of the speech." ), \ No newline at end of file diff --git a/docstore/7a761b6d-11b6-4ce6-925e-016a9b52f15b b/docstore/7a761b6d-11b6-4ce6-925e-016a9b52f15b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/7a761b6d-11b6-4ce6-925e-016a9b52f15b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/7a76d989-1b06-4f26-bcbb-d83f5e885519 b/docstore/7a76d989-1b06-4f26-bcbb-d83f5e885519 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/7a76d989-1b06-4f26-bcbb-d83f5e885519 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. 
We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/7a8a89d4-c39b-408a-b1a2-3c13e6351785 b/docstore/7a8a89d4-c39b-408a-b1a2-3c13e6351785 new file mode 100644 index 0000000000000000000000000000000000000000..02741f017de0a4f2326222e7e2c63ce436f783ef --- /dev/null +++ b/docstore/7a8a89d4-c39b-408a-b1a2-3c13e6351785 @@ -0,0 +1 @@ +unsafe prompt." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( response . Text ()) } JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const safetySettings = [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_LOW_AND_ABOVE" , }, { category : "HARM_CATEGORY_HATE_SPEECH" , threshold : "BLOCK_LOW_AND_ABOVE" , }, ]; async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Some potentially unsafe prompt." , config : { safetySettings : safetySettings , }, }); console . log ( response . text ); } await main (); Dart (Flutter) final safetySettings = [ SafetySetting ( HarmCategory . harassment , HarmBlockThreshold . low ), SafetySetting ( HarmCategory . hateSpeech , HarmBlockThreshold . low ), ]; final model = GenerativeModel ( model: 'gemini-1.5-flash' , apiKey: apiKey , safetySettings: safetySettings , ); Kotlin val harassmentSafety = SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ) val hateSpeechSafety = SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ) val generativeModel = GenerativeModel ( modelName = "gemini-1.5-flash" , apiKey = BuildConfig . apiKey , safetySettings = listOf ( harassmentSafety , hateSpeechSafety ) ) Java SafetySetting harassmentSafety = new SafetySetting ( HarmCategory . 
HARASSMENT , BlockThreshold . LOW_AND_ABOVE ); SafetySetting hateSpeechSafety = new SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ); GenerativeModel gm = new GenerativeModel ( "gemini-1.5-flash" , BuildConfig . apiKey , null , // generation config is optional Arrays . asList ( harassmentSafety , hateSpeechSafety ) ); GenerativeModelFutures model = GenerativeModelFutures . from ( gm ); REST echo '{ "safetySettings": [ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": \ No newline at end of file diff --git a/docstore/7a9e86b0-918c-4768-ab5b-803bfbc85916 b/docstore/7a9e86b0-918c-4768-ab5b-803bfbc85916 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/7a9e86b0-918c-4768-ab5b-803bfbc85916 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/7aad1b47-d340-4b64-8d02-08b1e14e0a29 b/docstore/7aad1b47-d340-4b64-8d02-08b1e14e0a29 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/7aad1b47-d340-4b64-8d02-08b1e14e0a29 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. 
""" return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/7abd2611-85cd-4350-8536-17852be327fe b/docstore/7abd2611-85cd-4350-8536-17852be327fe new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/7abd2611-85cd-4350-8536-17852be327fe @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/7ae8e787-0c5b-44cf-958f-c33e657419e2 b/docstore/7ae8e787-0c5b-44cf-958f-c33e657419e2 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/7ae8e787-0c5b-44cf-958f-c33e657419e2 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/7af556e3-560f-424d-bd78-98af6b7b7c21 b/docstore/7af556e3-560f-424d-bd78-98af6b7b7c21 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/7af556e3-560f-424d-bd78-98af6b7b7c21 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these 
model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use and may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/7b167946-51a0-4010-9dcd-60d6b5d9f54a b/docstore/7b167946-51a0-4010-9dcd-60d6b5d9f54a new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/7b167946-51a0-4010-9dcd-60d6b5d9f54a @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/7b30d7eb-b0ba-44c2-a17c-3775cd39b7fa b/docstore/7b30d7eb-b0ba-44c2-a17c-3775cd39b7fa new file mode 100644 index 0000000000000000000000000000000000000000..007f635c2d64ef42082bdbd7a31da59c854333a3 --- /dev/null +++ b/docstore/7b30d7eb-b0ba-44c2-a17c-3775cd39b7fa @@ -0,0 +1 @@ +SILENT } ) JavaScript import { GoogleGenAI , Modality , Behavior , FunctionResponseScheduling } from '@google/genai' ; // for a non-blocking function definition, apply scheduling in the function response: const functionResponse = { id : fc . id , name : fc . name , response : { result : "ok" , scheduling : FunctionResponseScheduling . INTERRUPT // Can also be WHEN_IDLE or SILENT } } Code execution You can define code execution as part of the session configuration. This lets the Live API generate and execute Python code and dynamically perform computations to benefit your results. See the Code execution tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" tools = [{ 'code_execution' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Compute the largest prime palindrome under 100000." await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ codeExecution : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! 
done ) { message = \ No newline at end of file diff --git a/docstore/7b337f04-d9ca-411f-94f8-6bc18104617b b/docstore/7b337f04-d9ca-411f-94f8-6bc18104617b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/7b337f04-d9ca-411f-94f8-6bc18104617b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/7b3ab832-a33f-44d3-8cfb-a7810f35e424 b/docstore/7b3ab832-a33f-44d3-8cfb-a7810f35e424 new file mode 100644 index 0000000000000000000000000000000000000000..65337d81cbf9fba76eb8d44ddc68611350b61de7 --- /dev/null +++ b/docstore/7b3ab832-a33f-44d3-8cfb-a7810f35e424 @@ -0,0 +1 @@ += lambda s : s . segment . end_index , reverse = True ) for support in sorted_supports : end_index = support . segment . end_index if support . grounding_chunk_indices : # Create citation string like [1](link1)[2](link2) citation_links = [] for i in support . grounding_chunk_indices : if i < len ( chunks ): uri = chunks [ i ] . web . uri citation_links . append ( f "[ { i + 1 } ]( { uri } )" ) citation_string = ", " . join ( citation_links ) text = text [: end_index ] + citation_string + text [ end_index :] return text # Assuming response with grounding metadata text_with_citations = add_citations ( response ) print ( text_with_citations ) JavaScript function addCitations ( response ) { let text = response . text ; const supports = response . candidates [ 0 ] ? . groundingMetadata ? . groundingSupports ; const chunks = response . candidates [ 0 ] ? . groundingMetadata ? . 
groundingChunks ; // Sort supports by end_index in descending order to avoid shifting issues when inserting. const sortedSupports = [... supports ]. sort ( ( a , b ) = > ( b . segment ? . endIndex ?? 0 ) - ( a . segment ? . endIndex ?? 0 ), ); for ( const support of sortedSupports ) { const endIndex = support . segment ? . endIndex ; if ( endIndex === undefined || ! support . groundingChunkIndices ? . length ) { continue ; } const citationLinks = support . groundingChunkIndices . map ( i = > { const uri = chunks [ i ] ? . web ? . uri ; if ( uri ) { return `[ ${ i + 1 } ]( ${ uri } )` ; } return null ; }) . filter ( Boolean ); if ( citationLinks . length > 0 ) { const citationString = citationLinks . join ( ", " ); text = text . slice ( 0 , endIndex ) + citationString + text . slice ( endIndex ); } } return text ; } const textWithCitations = addCitations ( response ); console . log ( textWithCitations ); The new response with inline citations will look like this: Spain won Euro 2024, defeating England 2-1 in the final.[1](https:/...), [2](https:/...), [4](https:/...), [5](https:/...) This victory \ No newline at end of file diff --git a/docstore/7b662773-8e2f-47f9-931a-ad2a7ab0754b b/docstore/7b662773-8e2f-47f9-931a-ad2a7ab0754b new file mode 100644 index 0000000000000000000000000000000000000000..c7d99b48acdb29ebe1cdd75df52d7215dd4d0ab1 --- /dev/null +++ b/docstore/7b662773-8e2f-47f9-931a-ad2a7ab0754b @@ -0,0 +1 @@ +- Zsh Zsh is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.zshrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use bash : touch ~/.zshrc open ~/.zshrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.zshrc Windows Search for "Environment Variables" in the system settings Edit either "User variables" (for current user) or "System variables" (for all users - use with caution). Create the variable and add export GEMINI_API_KEY=your_key_here Apply the changes Providing API key explicitly In some cases, you may want to explicitly provide an API key. For example: You're doing a simple API call and prefer hard coding the API key. You want explicit control without having to rely on automatic discovery of environment variables by the Gemini API libraries You're using an environment where environment variables are not supported (e.g web) or you are making REST calls. Below are examples for how you can provide an API key explicitly: Python from google import genai client = genai . Client ( api_key = " YOUR_API_KEY " ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : " YOUR_API_KEY " }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . 
\ No newline at end of file diff --git a/docstore/7b6d1e58-3c06-4525-ae3d-5ad481531794 b/docstore/7b6d1e58-3c06-4525-ae3d-5ad481531794 new file mode 100644 index 0000000000000000000000000000000000000000..9e5553cba5f3d4fd2f4057c580231dd55eeaff8e --- /dev/null +++ b/docstore/7b6d1e58-3c06-4525-ae3d-5ad481531794 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-intro#main-content Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7b71fcac-a4a7-421c-af3e-abc72fa8ec52 b/docstore/7b71fcac-a4a7-421c-af3e-abc72fa8ec52 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/7b71fcac-a4a7-421c-af3e-abc72fa8ec52 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/7b7e4d94-3a63-40f4-b3c5-c45f439ab8e5 b/docstore/7b7e4d94-3a63-40f4-b3c5-c45f439ab8e5 new file mode 100644 index 0000000000000000000000000000000000000000..e652ebcdf342b29a27305f6af4427b0dbb03d3f1 --- /dev/null +++ b/docstore/7b7e4d94-3a63-40f4-b3c5-c45f439ab8e5 @@ -0,0 +1 @@ +"gemini-2.5-flash" , contents = "What's the temperature in London?" , config = config , ) # Check for a function call if response . candidates [ 0 ] . content . parts [ 0 ] . function_call : function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call print ( f "Function to call: { function_call . name } " ) print ( f "Arguments: { function_call . 
args } " ) # In a real app, you would call your function here: # result = get_current_temperature(**function_call.args) else : print ( "No function call found in the response." ) print ( response . text ) JavaScript import { GoogleGenAI , Type } from '@google/genai' ; // Configure the client const ai = new GoogleGenAI ({}); // Define the function declaration for the model const weatherFunctionDeclaration = { name : 'get_current_temperature' , description : 'Gets the current temperature for a given location.' , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , description : 'The city name, e.g. San Francisco' , }, }, required : [ 'location' ], }, }; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : "What's the temperature in London?" , config : { tools : [{ functionDeclarations : [ weatherFunctionDeclaration ] }], }, }); // Check for function calls in the response if ( response . functionCalls && response . functionCalls . length > 0 ) { const functionCall = response . functionCalls [ 0 ]; // Assuming one function call console . log ( `Function to call: ${ functionCall . name } ` ); console . log ( `Arguments: ${ JSON . stringify ( functionCall . args ) } ` ); // In a real app, you would call your actual function here: // const result = await getCurrentTemperature(functionCall.args); } else { console . log ( "No function call found in the response." ); console . log ( response . text ); } REST curl \ No newline at end of file diff --git a/docstore/7b9ae82b-ba5c-4524-a966-09ce2f5527b7 b/docstore/7b9ae82b-ba5c-4524-a966-09ce2f5527b7 new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/7b9ae82b-ba5c-4524-a966-09ce2f5527b7 @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". 
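For reference, here is a minimal single-speaker sketch of the style-prompt approach above. It assumes the Python google-genai SDK and the 24 kHz, 16-bit mono PCM output used in the curl example; the output filename is illustrative. Python
from google import genai
from google.genai import types
import wave

client = genai.Client()

# A style instruction followed by the text to speak, as described above.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents='Say in a spooky whisper: "By the pricking of my thumbs... Something wicked this way comes"',
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Enceladus")
            )
        ),
    ),
)

# The TTS models return raw 24 kHz, 16-bit mono PCM; wrap it in a WAV container.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)
    f.setsampwidth(2)
    f.setframerate(24000)
    f.writeframes(pcm)
Choosing Enceladus here follows the voice-matching tip above; swapping in Puck (or any other prebuilt voice) only changes voice_name.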
Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/7bb490dc-9c41-47fa-8fed-8816dcfedc51 b/docstore/7bb490dc-9c41-47fa-8fed-8816dcfedc51 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/7bb490dc-9c41-47fa-8fed-8816dcfedc51 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/7bcbbaa7-216d-40d7-881c-30ec0a7e5090 b/docstore/7bcbbaa7-216d-40d7-881c-30ec0a7e5090 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/7bcbbaa7-216d-40d7-881c-30ec0a7e5090 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/7bd300eb-29ef-437e-8a2f-9ebe327721a7 b/docstore/7bd300eb-29ef-437e-8a2f-9ebe327721a7 new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/7bd300eb-29ef-437e-8a2f-9ebe327721a7 @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. 
"generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/7c060d3c-cca9-4a79-89c1-50b6ff54088e b/docstore/7c060d3c-cca9-4a79-89c1-50b6ff54088e new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/7c060d3c-cca9-4a79-89c1-50b6ff54088e @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/7c1488d5-0366-4710-8e2b-040d41c7ace4 b/docstore/7c1488d5-0366-4710-8e2b-040d41c7ace4 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/7c1488d5-0366-4710-8e2b-040d41c7ace4 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. 
What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/7c1af78f-fc14-4623-82c9-e36037b4703e b/docstore/7c1af78f-fc14-4623-82c9-e36037b4703e new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/7c1af78f-fc14-4623-82c9-e36037b4703e @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/7c227e0d-93b8-42c8-b248-6bf2c1c76055 b/docstore/7c227e0d-93b8-42c8-b248-6bf2c1c76055 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/7c227e0d-93b8-42c8-b248-6bf2c1c76055 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. 
tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/7c32e423-c4bc-4f6c-be0d-e3a15ca4a518 b/docstore/7c32e423-c4bc-4f6c-be0d-e3a15ca4a518 new file mode 100644 index 0000000000000000000000000000000000000000..3d241d4ff001a9868e67072728eae208556221c7 --- /dev/null +++ b/docstore/7c32e423-c4bc-4f6c-be0d-e3a15ca4a518 @@ -0,0 +1 @@ +"https://vertexaisearch.cloud.google.com/grounding-api-redirect/1234567890abcdef" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/abcdef1234567890" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : " YOUR_URL " , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > }, { "retrieved_url" : "https://vertexaisearch.cloud.google.com/grounding-api-redirect/fedcba0987654321" , "url_retrieval_status" : < UrlRe tr ievalS tatus .URL_RETRIEVAL_STATUS_SUCCESS : "URL_RETRIEVAL_STATUS_SUCCESS" > } ] } } } Supported models gemini-2.5-pro gemini-2.5-flash gemini-2.5-flash-lite gemini-2.0-flash gemini-2.0-flash-live-001 Limitations The tool will consume up to 20 URLs per request for analysis. For best results during experimental phase, use the tool on standard web pages rather than multimedia content such as YouTube videos. During experimental phase, the tool is free to use. Billing to come later. The experimental release has the following quotas: 1500 queries per day per project for requests made through the Gemini API 100 queries per day per user in Google AI Studio Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/7c341bd1-38f7-4529-a153-fdf602b19e77 b/docstore/7c341bd1-38f7-4529-a153-fdf602b19e77 new file mode 100644 index 0000000000000000000000000000000000000000..b0571e28c8e74f7e3e23139b08c0865b24edbd38 --- /dev/null +++ b/docstore/7c341bd1-38f7-4529-a153-fdf602b19e77 @@ -0,0 +1 @@ +And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . 
Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python . Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": \ No newline at end of file diff --git a/docstore/7c4513a0-fcec-403c-a7cb-b1c7bd540fe4 b/docstore/7c4513a0-fcec-403c-a7cb-b1c7bd540fe4 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/7c4513a0-fcec-403c-a7cb-b1c7bd540fe4 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. 
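As a minimal sketch of how the updated prompt above could be sent together with the image, assuming the Python google-genai SDK (the file path is illustrative): Python
from google import genai

client = genai.Client()

# Hypothetical local photo; the prompt explicitly asks for aspects of the image (here, the location).
rabbit_image = client.files.upload(file="path/to/rabbit.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        rabbit_image,
        "Can you write me a descriptive and dramatic poem about this image and include the location?",
    ],
)
print(response.text)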
Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/7c464038-3e37-4bae-9f62-95818f07bca2 b/docstore/7c464038-3e37-4bae-9f62-95818f07bca2 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/7c464038-3e37-4bae-9f62-95818f07bca2 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/7c496e0e-b2ee-41d4-95b7-66c6b5510b1d b/docstore/7c496e0e-b2ee-41d4-95b7-66c6b5510b1d new file mode 100644 index 0000000000000000000000000000000000000000..5e9121d1be57e8a1aaab0fbf3e894c8df7f2ac33 --- /dev/null +++ b/docstore/7c496e0e-b2ee-41d4-95b7-66c6b5510b1d @@ -0,0 +1 @@ +Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/7c6c296e-c080-41d8-860d-5f65a872393b b/docstore/7c6c296e-c080-41d8-860d-5f65a872393b new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/7c6c296e-c080-41d8-860d-5f65a872393b @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. 
For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. "generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/7c7663d0-7226-4aa2-8d45-c01de77f206d b/docstore/7c7663d0-7226-4aa2-8d45-c01de77f206d new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/7c7663d0-7226-4aa2-8d45-c01de77f206d @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . 
upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/7c9b9f92-cfcf-4362-8613-a39cecba00cd b/docstore/7c9b9f92-cfcf-4362-8613-a39cecba00cd new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/7c9b9f92-cfcf-4362-8613-a39cecba00cd @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . 
name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/7cb06273-6e78-4df4-8e43-6bab48d0663c b/docstore/7cb06273-6e78-4df4-8e43-6bab48d0663c new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/7cb06273-6e78-4df4-8e43-6bab48d0663c @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/7cb67ec5-261d-4350-ae9c-7e521385afe8 b/docstore/7cb67ec5-261d-4350-ae9c-7e521385afe8 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/7cb67ec5-261d-4350-ae9c-7e521385afe8 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? 
Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/7cc2fba9-d75f-402f-9e02-d76319963263 b/docstore/7cc2fba9-d75f-402f-9e02-d76319963263 new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/7cc2fba9-d75f-402f-9e02-d76319963263 @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . 
generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/7ccb8e3a-69b2-4ecf-ab9a-d97a9c58510d b/docstore/7ccb8e3a-69b2-4ecf-ab9a-d97a9c58510d new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/7ccb8e3a-69b2-4ecf-ab9a-d97a9c58510d @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/7cd6c981-e14f-4f10-9c98-a8c9c253ea01 b/docstore/7cd6c981-e14f-4f10-9c98-a8c9c253ea01 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/7cd6c981-e14f-4f10-9c98-a8c9c253ea01 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/7ce8b246-e54d-4613-b10d-bccb5abf28b8 b/docstore/7ce8b246-e54d-4613-b10d-bccb5abf28b8 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/7ce8b246-e54d-4613-b10d-bccb5abf28b8 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . 
If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/7ceee243-6782-4d5a-a029-3b1c2a168db5 b/docstore/7ceee243-6782-4d5a-a029-3b1c2a168db5 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/7ceee243-6782-4d5a-a029-3b1c2a168db5 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/7cfdb4e1-ed4b-4c62-8c72-6153c48de56f b/docstore/7cfdb4e1-ed4b-4c62-8c72-6153c48de56f new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/7cfdb4e1-ed4b-4c62-8c72-6153c48de56f @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. 
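For comparison, the inline-data path from the curl example looks roughly like this with the Python google-genai SDK; the file path is illustrative, and the image must keep the total request under the 20MB limit. Python
from google import genai
from google.genai import types

client = genai.Client()

# Read the image and send it as inline bytes (counts toward the 20MB request limit).
with open("path/to/small-sample.jpg", "rb") as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Caption this image.",
    ],
)
print(response.text)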
Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/7d0aa47b-a48d-4cfa-b357-e2318987e1f9 b/docstore/7d0aa47b-a48d-4cfa-b357-e2318987e1f9 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/7d0aa47b-a48d-4cfa-b357-e2318987e1f9 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. 
Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/7d2c6617-f6ca-463b-8032-43ac9bede8b4 b/docstore/7d2c6617-f6ca-463b-8032-43ac9bede8b4 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/7d2c6617-f6ca-463b-8032-43ac9bede8b4 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/7d2e04a5-5ecf-4246-95e2-774f4c7871da b/docstore/7d2e04a5-5ecf-4246-95e2-774f4c7871da new file mode 100644 index 0000000000000000000000000000000000000000..5e9121d1be57e8a1aaab0fbf3e894c8df7f2ac33 --- /dev/null +++ b/docstore/7d2e04a5-5ecf-4246-95e2-774f4c7871da @@ -0,0 +1 @@ +Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. 
\ No newline at end of file diff --git a/docstore/7d3fbc0f-5916-461f-adb0-a2a41f3a7f11 b/docstore/7d3fbc0f-5916-461f-adb0-a2a41f3a7f11 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/7d3fbc0f-5916-461f-adb0-a2a41f3a7f11 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/7d4d5852-a7c4-4935-a506-97b3a54bd8d4 b/docstore/7d4d5852-a7c4-4935-a506-97b3a54bd8d4 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/7d4d5852-a7c4-4935-a506-97b3a54bd8d4 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. 
Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/7d579ddc-d044-437a-ad76-38d206853fd3 b/docstore/7d579ddc-d044-437a-ad76-38d206853fd3 new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/7d579ddc-d044-437a-ad76-38d206853fd3 @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . 
push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/7d8a7c86-727c-4a30-b759-81ee9d023e4c b/docstore/7d8a7c86-727c-4a30-b759-81ee9d023e4c new file mode 100644 index 0000000000000000000000000000000000000000..9a3ae8e54d036eb9d08cf51953b4e3479c03ffae --- /dev/null +++ b/docstore/7d8a7c86-727c-4a30-b759-81ee9d023e4c @@ -0,0 +1 @@ +Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/7d9aaf0f-ec6f-4d82-abb1-c24af878781c b/docstore/7d9aaf0f-ec6f-4d82-abb1-c24af878781c new file mode 100644 index 0000000000000000000000000000000000000000..acf59327e1e166ed3c09727e15204d3ee29ea7c7 --- /dev/null +++ b/docstore/7d9aaf0f-ec6f-4d82-abb1-c24af878781c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-1.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7da2f090-9f0a-4e1b-96d2-859ab78232b1 b/docstore/7da2f090-9f0a-4e1b-96d2-859ab78232b1 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/7da2f090-9f0a-4e1b-96d2-859ab78232b1 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? 
-135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/7db9b266-71c9-4d29-b37d-d3d7db171252 b/docstore/7db9b266-71c9-4d29-b37d-d3d7db171252 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/7db9b266-71c9-4d29-b37d-d3d7db171252 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/7dbd5652-fe66-402b-bd54-f3915b063fbe b/docstore/7dbd5652-fe66-402b-bd54-f3915b063fbe new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/7dbd5652-fe66-402b-bd54-f3915b063fbe @@ -0,0 +1 @@ +voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . 
VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/7dc7bfdb-28c5-49aa-8cab-56a431b74a39 b/docstore/7dc7bfdb-28c5-49aa-8cab-56a431b74a39 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/7dc7bfdb-28c5-49aa-8cab-56a431b74a39 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. 
The \ No newline at end of file diff --git a/docstore/7dc8c265-19bf-405b-899b-1bf82798c3f9 b/docstore/7dc8c265-19bf-405b-899b-1bf82798c3f9 new file mode 100644 index 0000000000000000000000000000000000000000..d22ece2b55b32ea1d25de4357312c4fc8586a587 --- /dev/null +++ b/docstore/7dc8c265-19bf-405b-899b-1bf82798c3f9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7dc9558b-14c8-4747-8693-15563f79b985 b/docstore/7dc9558b-14c8-4747-8693-15563f79b985 new file mode 100644 index 0000000000000000000000000000000000000000..53f73acff00b9d698b5b684be4346e1d720dab05 --- /dev/null +++ b/docstore/7dc9558b-14c8-4747-8693-15563f79b985 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#live-api Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/7dde9317-884a-46dd-9089-9015dd2fbc80 b/docstore/7dde9317-884a-46dd-9089-9015dd2fbc80 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/7dde9317-884a-46dd-9089-9015dd2fbc80 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . 
Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/7dea988b-1018-4cbe-95c5-a2e8f9bf6a34 b/docstore/7dea988b-1018-4cbe-95c5-a2e8f9bf6a34 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/7dea988b-1018-4cbe-95c5-a2e8f9bf6a34 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/7dfd9bd3-51a3-42b4-b53c-5ee796f48caa b/docstore/7dfd9bd3-51a3-42b4-b53c-5ee796f48caa new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/7dfd9bd3-51a3-42b4-b53c-5ee796f48caa @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/7e0883b1-c35b-4cb3-bdf5-828a689ec0a9 b/docstore/7e0883b1-c35b-4cb3-bdf5-828a689ec0a9 new file mode 100644 index 0000000000000000000000000000000000000000..b7772256110e1184cd9828af16aaf33fa099feda --- /dev/null +++ b/docstore/7e0883b1-c35b-4cb3-bdf5-828a689ec0a9 @@ -0,0 +1 @@ +enlisting people in 'red teams' to try and break your application. In automated testing, the 'red team' is another language model that finds input text that elicit harmful outputs from the model being tested. Note: LLMs are known to sometimes produce different outputs for the same input prompt. Multiple rounds of testing may be needed to catch more of the problematic outputs. Monitor for problems No matter how much you test and mitigate, you can never guarantee perfection, so plan upfront how you'll spot and deal with problems that arise. Common approaches include setting up a monitored channel for users to share feedback (e.g., thumbs up/down rating) and running a user study to proactively solicit feedback from a diverse mix of users — especially valuable if usage patterns are different to expectations. Advanced tips When users give feedback to AI products, it can greatly improve the AI performance and the user experience over time by, for example, helping you choose better examples for prompt tuning. The Feedback and Control chapter in Google's People and AI guidebook highlights key considerations to take into account when designing feedback mechanisms. Next steps Refer to the safety settings guide to learn about the adjustable safety settings available through the Gemini API. See the intro to prompting to get started writing your first prompts. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-02-25 UTC. 
\ No newline at end of file diff --git a/docstore/7e146f25-3fb0-45c3-9623-7a74b18839f3 b/docstore/7e146f25-3fb0-45c3-9623-7a74b18839f3 new file mode 100644 index 0000000000000000000000000000000000000000..8711f29609bdb3bc24fbc7c22d5a2b29ce171b6a --- /dev/null +++ b/docstore/7e146f25-3fb0-45c3-9623-7a74b18839f3 @@ -0,0 +1 @@ +affiliates. Last updated 2025-02-25 UTC. \ No newline at end of file diff --git a/docstore/7e1b7e9f-55b0-4d19-ad06-f471c068a0af b/docstore/7e1b7e9f-55b0-4d19-ad06-f471c068a0af new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/7e1b7e9f-55b0-4d19-ad06-f471c068a0af @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/7e51df68-163d-4b27-b989-2108173a7d6d b/docstore/7e51df68-163d-4b27-b989-2108173a7d6d new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/7e51df68-163d-4b27-b989-2108173a7d6d @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. 
Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to turn this place into a party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/7e5c5526-c61c-4660-ace2-cbeed71b57d4 b/docstore/7e5c5526-c61c-4660-ace2-cbeed71b57d4 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/7e5c5526-c61c-4660-ace2-cbeed71b57d4 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile .
mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/7e655779-1087-4d13-af9f-b54739df3b11 b/docstore/7e655779-1087-4d13-af9f-b54739df3b11 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/7e655779-1087-4d13-af9f-b54739df3b11 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/7e7a23da-8d63-4bd5-8718-10e17215f5f5 b/docstore/7e7a23da-8d63-4bd5-8718-10e17215f5f5 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/7e7a23da-8d63-4bd5-8718-10e17215f5f5 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
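To make the embedding description above concrete, the following is a minimal sketch only, assuming the google-genai Python client exposes a models.embed_content method for this model; the input strings and the dot-product comparison are illustrative placeholders, not part of the model card that follows.

from google import genai

# Minimal sketch (assumption): embed two strings with text-embedding-004 and compare
# them with a dot product as a rough relatedness score.
client = genai.Client()

result = client.models.embed_content(
    model="models/text-embedding-004",
    contents=["How do proteins fold?", "What is protein folding?"],
)

# Each entry in result.embeddings is assumed to expose its 768-dimensional vector via .values.
v1, v2 = (e.values for e in result.embeddings)
score = sum(a * b for a, b in zip(v1, v2))
print(f"Dot-product relatedness: {score:.3f}")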
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/7edd22c5-ee25-449b-b99e-a96f42e1ce3a b/docstore/7edd22c5-ee25-449b-b99e-a96f42e1ce3a new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/7edd22c5-ee25-449b-b99e-a96f42e1ce3a @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries. Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing.
You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/7ee4aaff-fabc-4b3f-afb1-9417549888a7 b/docstore/7ee4aaff-fabc-4b3f-afb1-9417549888a7 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/7ee4aaff-fabc-4b3f-afb1-9417549888a7 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/7f0c8079-6f47-47af-b733-b139b4014df2 b/docstore/7f0c8079-6f47-47af-b733-b139b4014df2 new file mode 100644 index 0000000000000000000000000000000000000000..3be596a615ed567f90abc0547a663aefd35b8061 --- /dev/null +++ b/docstore/7f0c8079-6f47-47af-b733-b139b4014df2 @@ -0,0 +1 @@ +:{}, 'temperature' : 0.7 , 'response_modalities' :[ 'TEXT' ] } }, 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token = await client .
authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime , liveConnectConstraints : { model : 'gemini-2.0-flash-live-001' , config : { sessionResumption : {}, temperature : 0.7 , responseModalities : [ 'TEXT' ] } }, httpOptions : { apiVersion : 'v1alpha' } } }); // You'll need to pass the value under token.name back to your client to use it You can also lock a subset of fields, see the SDK documentation for more info. Connect to Live API with an ephemeral token Once you have an ephemeral token, you use it as if it were an API key (but remember, it only works for the live API, and only with the v1alpha version of the API). Note that use of ephemeral tokens only adds value when deploying applications that follow client-to-server implementation approach. JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; // Use the token generated in the "Create an ephemeral token" section here const ai = new GoogleGenAI ({ apiKey : token . name }); const model = 'gemini-2.0-flash-live-001' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : { ... }, }); // Send content... session . close (); } main (); Note: If not using the SDK, note that ephemeral tokens must either be passed in an access_token query parameter, or in an HTTP Authorization prefixed by the auth-scheme Token . See Get started with Live API for more examples. Best practices Set a short expiration duration using the expire_time parameter. Tokens expire, \ No newline at end of file diff --git a/docstore/7f452560-5a16-44c8-a106-ceddc3502a33 b/docstore/7f452560-5a16-44c8-a106-ceddc3502a33 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/7f452560-5a16-44c8-a106-ceddc3502a33 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/7f4f11e5-7c1f-4505-8e71-aa3a25010c9d b/docstore/7f4f11e5-7c1f-4505-8e71-aa3a25010c9d new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/7f4f11e5-7c1f-4505-8e71-aa3a25010c9d @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/7f65f9d7-ece3-4b6e-87ea-3e8f70ff459b b/docstore/7f65f9d7-ece3-4b6e-87ea-3e8f70ff459b new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/7f65f9d7-ece3-4b6e-87ea-3e8f70ff459b @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/7f87331d-484e-4493-bec9-41860f54cf4e b/docstore/7f87331d-484e-4493-bec9-41860f54cf4e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/7f87331d-484e-4493-bec9-41860f54cf4e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
\ No newline at end of file diff --git a/docstore/7f8aec97-959d-4f37-ac4e-b4b97f5ff932 b/docstore/7f8aec97-959d-4f37-ac4e-b4b97f5ff932 new file mode 100644 index 0000000000000000000000000000000000000000..0b9f6d133a3c8fd1d2ce9eec58555814c092074d --- /dev/null +++ b/docstore/7f8aec97-959d-4f37-ac4e-b4b97f5ff932 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"inline_data": {"mime_type": "application/pdf", "data": "' " $ENCODED_PDF " '"}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" You can also read a PDF from a local file for processing: Python from google import genai from google.genai import types import pathlib client = genai . Client () # Retrieve and encode the PDF byte filepath = pathlib . Path ( 'file.pdf' ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = filepath . read_bytes (), mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from 'fs' ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( fs . readFileSync ( "content/343019_3_art_0_py4t4l_convrt.pdf" )). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfBytes , _ := os . ReadFile ( "path/to/your/file.pdf" ) parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText \ No newline at end of file diff --git a/docstore/7f9bd111-8300-4f9d-88b8-fe5d53138b15 b/docstore/7f9bd111-8300-4f9d-88b8-fe5d53138b15 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/7f9bd111-8300-4f9d-88b8-fe5d53138b15 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
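Because the Live models on this page stream over a persistent session rather than a one-shot request, a short text-only sketch may help. It assumes the google-genai async live surface (client.aio.live.connect, send_client_content, receive) used elsewhere in these docs; the greeting prompt is illustrative.

Python
import asyncio
from google import genai

client = genai.Client()
model = "gemini-2.0-flash-live-001"
config = {"response_modalities": ["TEXT"]}

async def main():
    # Open a persistent Live API session.
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send a single user turn and mark it complete.
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello, are you there?"}]},
            turn_complete=True,
        )
        # Stream the model's reply as it arrives.
        async for response in session.receive():
            if response.text is not None:
                print(response.text, end="")

if __name__ == "__main__":
    asyncio.run(main())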
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/7fcbc032-38f3-4c48-84c1-ad4a7767d905 b/docstore/7fcbc032-38f3-4c48-84c1-ad4a7767d905 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/7fcbc032-38f3-4c48-84c1-ad4a7767d905 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. 
Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/7fd8854d-19b1-4e7a-9d9b-fb3ccdaa68ae b/docstore/7fd8854d-19b1-4e7a-9d9b-fb3ccdaa68ae new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/7fd8854d-19b1-4e7a-9d9b-fb3ccdaa68ae @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/7fdc8a53-b406-4c36-84d0-f256554a89ea b/docstore/7fdc8a53-b406-4c36-84d0-f256554a89ea new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/7fdc8a53-b406-4c36-84d0-f256554a89ea @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
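As a minimal sketch of generating embeddings with the model code listed above (text-embedding-004), assuming the google-genai embed_content call; the example strings are illustrative.

Python
from google import genai

client = genai.Client()

# Embed a couple of strings with the text-embedding-004 model.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=[
        "What is the meaning of life?",
        "How much wood would a woodchuck chuck?",
    ],
)

for embedding in result.embeddings:
    # Each vector has 768 dimensions for this model (see the table that follows).
    print(len(embedding.values), embedding.values[:4])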
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/7ffc4902-a535-415e-867c-250864f44bda b/docstore/7ffc4902-a535-415e-867c-250864f44bda new file mode 100644 index 0000000000000000000000000000000000000000..83e8a7f39a569661ceb51609e03cd9ce9f516cda --- /dev/null +++ b/docstore/7ffc4902-a535-415e-867c-250864f44bda @@ -0,0 +1 @@ +will only show up for projects that meet next tier qualifications . After a quick validation, the project will be upgraded to the next tier. Request a rate limit increase Each model variation has an associated rate limit (requests per minute, RPM). For details on those rate limits, see Gemini models . Request paid tier rate limit increase We offer no guarantees about increasing your rate limit, but we'll do our best to review your request and reach out to you if we're able to accommodate your capacity needs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/7ffe3d70-09db-49e1-bf18-4ae91fe7fb00 b/docstore/7ffe3d70-09db-49e1-bf18-4ae91fe7fb00 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/7ffe3d70-09db-49e1-bf18-4ae91fe7fb00 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
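A minimal text-only call to the Flash-Lite model described above, assuming the same google-genai client pattern used in the other samples on this page; the prompt is illustrative.

Python
from google import genai

client = genai.Client()

# One-shot text request against the cost-efficient Flash-Lite model.
response = client.models.generate_content(
    model="gemini-2.0-flash-lite",
    contents="In one sentence, explain why low latency matters for chat UIs.",
)
print(response.text)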
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/80273d62-9f2b-408c-83cd-6ac5499335f5 b/docstore/80273d62-9f2b-408c-83cd-6ac5499335f5 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/80273d62-9f2b-408c-83cd-6ac5499335f5 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
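Veo requests are long-running rather than synchronous. The following rough sketch assumes the google-genai video-generation surface (generate_videos returning an operation that is polled with operations.get); the prompt text, polling interval, and output filename are illustrative assumptions.

Python
import time
from google import genai

client = genai.Client()

# Kick off an asynchronous (long-running) video generation job.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A slow pan across a rain-soaked neon street at night",  # illustrative
)

# Poll the operation until the video is ready.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Download and save the first generated video.
generated = operation.response.generated_videos[0]
client.files.download(file=generated.video)
generated.video.save("veo_sample.mp4")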
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/80355aa4-e7eb-4700-83d7-5cc8e458228f b/docstore/80355aa4-e7eb-4700-83d7-5cc8e458228f new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/80355aa4-e7eb-4700-83d7-5cc8e458228f @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). 
If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/80458cb3-4c13-4b5f-9565-2e114f2cebb7 b/docstore/80458cb3-4c13-4b5f-9565-2e114f2cebb7 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/80458cb3-4c13-4b5f-9565-2e114f2cebb7 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/80489ba3-36e6-4da3-ad84-b2db428b2df0 b/docstore/80489ba3-36e6-4da3-ad84-b2db428b2df0 new file mode 100644 index 0000000000000000000000000000000000000000..45c046a450410d0d7cea0863f584c81b40ede6bc --- /dev/null +++ b/docstore/80489ba3-36e6-4da3-ad84-b2db428b2df0 @@ -0,0 +1 @@ +Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' , system_instruction = 'you are a story teller for kids under 5 years old' , generation_config = genai . GenerationConfig ( max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], ) ) response = model . generate_content ( 'tell me a story in 100 words' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . 
getGenerativeModel ({ model : "gemini-1.5-flash" , generationConfig : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); const result = await model . generateContent ( "Tell me a story about a magic backpack." , ); console . log ( result . response . text ()) Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) model . SetTemperature ( 0.5 ) model . SetTopP ( 0.5 ) model . SetTopK ( 2.0 ) model . SetMaxOutputTokens ( 100 ) model . ResponseMIMEType = "application/json" resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about New York" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python For all methods in the new SDK, the required arguments are provided as keyword arguments. All optional inputs are provided in the config argument. Config arguments can be specified as either Python dictionaries or Config classes in the google.genai.types namespace. For utility and uniformity, all definitions within the types module are pydantic classes. from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/805b102f-bc21-4389-9f00-a347dbb9eeb0 b/docstore/805b102f-bc21-4389-9f00-a347dbb9eeb0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/805b102f-bc21-4389-9f00-a347dbb9eeb0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
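The new-SDK migration snippet above is cut off mid-call in this extract. As a hedged sketch of what an equivalent request looks like with the google-genai client, mirroring the options from the "Before" example (system instruction, sampling parameters, JSON MIME type, stop sequence):

Python
from google import genai
from google.genai import types

client = genai.Client()

# All optional settings move into a single GenerateContentConfig object.
response = client.models.generate_content(
    model="gemini-1.5-flash",
    contents="Tell me a story in 100 words.",
    config=types.GenerateContentConfig(
        system_instruction="you are a story teller for kids under 5 years old",
        max_output_tokens=400,
        top_k=2,
        top_p=0.5,
        temperature=0.5,
        response_mime_type="application/json",
        stop_sequences=["\n"],
    ),
)
print(response.text)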
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/805de3b2-5320-41e5-b8eb-7895c1eb0889 b/docstore/805de3b2-5320-41e5-b8eb-7895c1eb0889 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/805de3b2-5320-41e5-b8eb-7895c1eb0889 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/807c1576-dd1a-46c1-a31e-6373f6a5b1de b/docstore/807c1576-dd1a-46c1-a31e-6373f6a5b1de new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/807c1576-dd1a-46c1-a31e-6373f6a5b1de @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. 
If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/8080aea7-e7c6-47dc-8239-57dfd61d64d0 b/docstore/8080aea7-e7c6-47dc-8239-57dfd61d64d0 new file mode 100644 index 0000000000000000000000000000000000000000..2433d9b61cb36215b85b944fa4e917b67d7df080 --- /dev/null +++ b/docstore/8080aea7-e7c6-47dc-8239-57dfd61d64d0 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/api-key Title: Using Gemini API keys | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/80bb98e7-4fd5-47d7-9ddc-1fe563b4b35f b/docstore/80bb98e7-4fd5-47d7-9ddc-1fe563b4b35f new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/80bb98e7-4fd5-47d7-9ddc-1fe563b4b35f @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/80c03f53-6590-4be7-9827-6be22cc9a5d9 b/docstore/80c03f53-6590-4be7-9827-6be22cc9a5d9 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/80c03f53-6590-4be7-9827-6be22cc9a5d9 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/80e3b10a-9474-4059-98df-b213850cf79b b/docstore/80e3b10a-9474-4059-98df-b213850cf79b new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/80e3b10a-9474-4059-98df-b213850cf79b @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. 
A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/810652ce-2799-45ff-b2ee-ea03d6d6ad1a b/docstore/810652ce-2799-45ff-b2ee-ea03d6d6ad1a new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/810652ce-2799-45ff-b2ee-ea03d6d6ad1a @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/810ac2d5-72dd-4c2e-8af2-e6b6200664bb b/docstore/810ac2d5-72dd-4c2e-8af2-e6b6200664bb new file mode 100644 index 0000000000000000000000000000000000000000..c6aa65d262d73786cc7d106425f2574a0a896d12 --- /dev/null +++ b/docstore/810ac2d5-72dd-4c2e-8af2-e6b6200664bb @@ -0,0 +1 @@ +the modal, you can use the sliders to adjust the content filtering level per safety category: Note: If you set any of the category filters to Block none , Google AI Studio will display a reminder about the Gemini API's Terms of Service with respect to safety settings. 
When you send a request (for example, by asking the model a question), a warning No Content message appears if the request's content is blocked. To see more details, hold the pointer over the No Content text and click warning Safety . Gemini API SDKs The following code snippet shows how to set safety settings in your GenerateContent call. This sets the thresholds for the harassment ( HARM_CATEGORY_HARASSMENT ) and hate speech ( HARM_CATEGORY_HATE_SPEECH ) categories. For example, setting these categories to BLOCK_LOW_AND_ABOVE blocks any content that has a low or higher probability of being harassment or hate speech. To understand the threshold settings, see Safety filtering per request . Python from google import genai from google.genai import types import PIL.Image img = PIL . Image . open ( "cookies.jpg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ 'Do these look store-bought or homemade?' , img ], config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = types . HarmCategory . HARM_CATEGORY_HATE_SPEECH , threshold = types . HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , ), ] ) ) print ( response . text ) Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SafetySettings : [] * genai . SafetySetting { { Category : "HARM_CATEGORY_HATE_SPEECH" , Threshold : "BLOCK_LOW_AND_ABOVE" , }, }, } response , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Some potentially \ No newline at end of file diff --git a/docstore/810ba5f0-7e7a-4acf-9d0d-443d364bae0d b/docstore/810ba5f0-7e7a-4acf-9d0d-443d364bae0d new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/810ba5f0-7e7a-4acf-9d0d-443d364bae0d @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/8119ff47-e865-44cb-b544-136810136434 b/docstore/8119ff47-e865-44cb-b544-136810136434 new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/8119ff47-e865-44cb-b544-136810136434 @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. 
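Putting the declaration fields described above (name, description, typed parameters, enum, required) together, here is a small sketch of passing such a declaration to the model with the google-genai SDK; the set_light_color function and its enum values are hypothetical examples, not taken from this page.

Python
from google import genai
from google.genai import types

# Hypothetical declaration illustrating the fields described above.
set_light_color = {
    "name": "set_light_color",
    "description": "Sets the color temperature of a smart light in the user's home.",
    "parameters": {
        "type": "object",
        "properties": {
            "color_temperature": {
                "type": "string",
                "description": "Color temperature preset for the light.",
                "enum": ["daylight", "cool", "warm"],
            },
        },
        "required": ["color_temperature"],
    },
}

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Make the living room lights feel warm and cozy.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[set_light_color])],
    ),
)

# If the model chose to call the tool, the call (name plus arguments) comes back
# as a function_call part instead of plain text.
part = response.candidates[0].content.parts[0]
print(part.function_call)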
To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/81248586-5741-486f-a255-cfe4ebe60637 b/docstore/81248586-5741-486f-a255-cfe4ebe60637 new file mode 100644 index 0000000000000000000000000000000000000000..61c5e503920dcb87c18a5f61af1ef61e3cf960fa --- /dev/null +++ b/docstore/81248586-5741-486f-a255-cfe4ebe60637 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/openai#extra-body Title: OpenAI compatibility | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/81266156-c0cb-4b0c-9a1f-b0807f7530c0 b/docstore/81266156-c0cb-4b0c-9a1f-b0807f7530c0 new file mode 100644 index 0000000000000000000000000000000000000000..2bc9ee1b64943d2fc9ee4b66d281a35e0e278a02 --- /dev/null +++ b/docstore/81266156-c0cb-4b0c-9a1f-b0807f7530c0 @@ -0,0 +1 @@ +Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . 
LiveConnectConfig ( \ No newline at end of file diff --git a/docstore/81322f2c-2ac0-4b8c-aacc-3a8e5aa60cd8 b/docstore/81322f2c-2ac0-4b8c-aacc-3a8e5aa60cd8 new file mode 100644 index 0000000000000000000000000000000000000000..ce35346b712902c0bb2520a3719cbf944185a4fe --- /dev/null +++ b/docstore/81322f2c-2ac0-4b8c-aacc-3a8e5aa60cd8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#upload-image Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/81431061-0580-43f9-a41f-f181b715e2ed b/docstore/81431061-0580-43f9-a41f-f181b715e2ed new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/81431061-0580-43f9-a41f-f181b715e2ed @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/815d2c1f-072d-4e8c-ac0c-3e2fb89a5448 b/docstore/815d2c1f-072d-4e8c-ac0c-3e2fb89a5448 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/815d2c1f-072d-4e8c-ac0c-3e2fb89a5448 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/817b962c-d5c3-4d38-a492-e775e2b560ff b/docstore/817b962c-d5c3-4d38-a492-e775e2b560ff new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/817b962c-d5c3-4d38-a492-e775e2b560ff @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/8183696b-8ef7-4bea-bbb4-1dc89f0acba0 b/docstore/8183696b-8ef7-4bea-bbb4-1dc89f0acba0 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/8183696b-8ef7-4bea-bbb4-1dc89f0acba0 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . 
Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/8183811b-ee09-4c32-a8fe-d4ac6f1fdc44 b/docstore/8183811b-ee09-4c32-a8fe-d4ac6f1fdc44 new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/8183811b-ee09-4c32-a8fe-d4ac6f1fdc44 @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. 
Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/81993a88-b11d-407f-a7a6-d17b4d354709 b/docstore/81993a88-b11d-407f-a7a6-d17b4d354709 new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/81993a88-b11d-407f-a7a6-d17b4d354709 @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' 
}, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/81a8fe63-8316-429f-b315-6cd806dbbf2a b/docstore/81a8fe63-8316-429f-b315-6cd806dbbf2a new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/81a8fe63-8316-429f-b315-6cd806dbbf2a @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/81c55545-b2a1-46c0-acab-b95bf547d7f2 b/docstore/81c55545-b2a1-46c0-acab-b95bf547d7f2 new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/81c55545-b2a1-46c0-acab-b95bf547d7f2 @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
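# This first curl sends only the JSONL file's metadata (display_name); the actual
# bytes are uploaded by the second curl below, after the x-goog-upload-url header
# has been extracted from the dumped response headers and the upload is finalized.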
curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/81e64a88-1320-4fd1-9abe-35b2a44e9315 b/docstore/81e64a88-1320-4fd1-9abe-35b2a44e9315 new file mode 100644 index 0000000000000000000000000000000000000000..0108c51f9cfafd29a150b14ecbeca25bcacc2d82 --- /dev/null +++ b/docstore/81e64a88-1320-4fd1-9abe-35b2a44e9315 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries#install Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8201095d-4dfa-457e-a51a-387619b1098e b/docstore/8201095d-4dfa-457e-a51a-387619b1098e new file mode 100644 index 0000000000000000000000000000000000000000..198ea911571bf7f5fba76b46d338956e37a5f152 --- /dev/null +++ b/docstore/8201095d-4dfa-457e-a51a-387619b1098e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/820d6323-2625-4b17-8aaa-98ba4705fa77 b/docstore/820d6323-2625-4b17-8aaa-98ba4705fa77 new file mode 100644 index 0000000000000000000000000000000000000000..e652ebcdf342b29a27305f6af4427b0dbb03d3f1 --- /dev/null +++ b/docstore/820d6323-2625-4b17-8aaa-98ba4705fa77 @@ -0,0 +1 @@ +"gemini-2.5-flash" , contents = "What's the temperature in London?" , config = config , ) # Check for a function call if response . candidates [ 0 ] . content . parts [ 0 ] . function_call : function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call print ( f "Function to call: { function_call . name } " ) print ( f "Arguments: { function_call . args } " ) # In a real app, you would call your function here: # result = get_current_temperature(**function_call.args) else : print ( "No function call found in the response." ) print ( response . text ) JavaScript import { GoogleGenAI , Type } from '@google/genai' ; // Configure the client const ai = new GoogleGenAI ({}); // Define the function declaration for the model const weatherFunctionDeclaration = { name : 'get_current_temperature' , description : 'Gets the current temperature for a given location.' 
, parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , description : 'The city name, e.g. San Francisco' , }, }, required : [ 'location' ], }, }; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : "What's the temperature in London?" , config : { tools : [{ functionDeclarations : [ weatherFunctionDeclaration ] }], }, }); // Check for function calls in the response if ( response . functionCalls && response . functionCalls . length > 0 ) { const functionCall = response . functionCalls [ 0 ]; // Assuming one function call console . log ( `Function to call: ${ functionCall . name } ` ); console . log ( `Arguments: ${ JSON . stringify ( functionCall . args ) } ` ); // In a real app, you would call your actual function here: // const result = await getCurrentTemperature(functionCall.args); } else { console . log ( "No function call found in the response." ); console . log ( response . text ); } REST curl \ No newline at end of file diff --git a/docstore/8216af6b-b747-4d57-b725-ee11f2ac5e73 b/docstore/8216af6b-b747-4d57-b725-ee11f2ac5e73 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/8216af6b-b747-4d57-b725-ee11f2ac5e73 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/822b2fb3-dc40-4831-8334-f76f49250390 b/docstore/822b2fb3-dc40-4831-8334-f76f49250390 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/822b2fb3-dc40-4831-8334-f76f49250390 @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/82469696-8578-4fc2-aa3b-2ce167f42963 b/docstore/82469696-8578-4fc2-aa3b-2ce167f42963 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/82469696-8578-4fc2-aa3b-2ce167f42963 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/825ea7f0-9df4-4708-8d20-163e16f1ed05 b/docstore/825ea7f0-9df4-4708-8d20-163e16f1ed05 new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/825ea7f0-9df4-4708-8d20-163e16f1ed05 @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. 
If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } \ No newline at end of file diff --git a/docstore/8260d534-7c03-42c0-a377-32365840d62b b/docstore/8260d534-7c03-42c0-a377-32365840d62b new file mode 100644 index 0000000000000000000000000000000000000000..a07288c87291962aa181765ea596a2b1afe9ed3f --- /dev/null +++ b/docstore/8260d534-7c03-42c0-a377-32365840d62b @@ -0,0 +1 @@ +Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token \ No newline at end of file diff --git a/docstore/8266f3a1-985f-4aa4-b595-3a5b4176d738 b/docstore/8266f3a1-985f-4aa4-b595-3a5b4176d738 new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/8266f3a1-985f-4aa4-b595-3a5b4176d738 @@ -0,0 +1 @@ +a picture of me. 
Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me. Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/826827dd-3217-4eb2-bc6b-ed531b80bc13 b/docstore/826827dd-3217-4eb2-bc6b-ed531b80bc13 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/826827dd-3217-4eb2-bc6b-ed531b80bc13 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. 
Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/82715618-6589-4596-94d7-f680e9fb5150 b/docstore/82715618-6589-4596-94d7-f680e9fb5150 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/82715618-6589-4596-94d7-f680e9fb5150 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/82ba844f-cb93-4e47-b62f-5cfc85bddcc0 b/docstore/82ba844f-cb93-4e47-b62f-5cfc85bddcc0 new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/82ba844f-cb93-4e47-b62f-5cfc85bddcc0 @@ -0,0 +1 @@ +voice name from the prebuilt output voices . 
This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/82ea2d16-fef3-408d-90cc-c7edc038b952 b/docstore/82ea2d16-fef3-408d-90cc-c7edc038b952 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/82ea2d16-fef3-408d-90cc-c7edc038b952 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/8315209b-5475-4002-990d-67e05f471f33 b/docstore/8315209b-5475-4002-990d-67e05f471f33 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/8315209b-5475-4002-990d-67e05f471f33 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/83339907-25bc-41e0-9cf8-71df31954734 b/docstore/83339907-25bc-41e0-9cf8-71df31954734 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ 
b/docstore/83339907-25bc-41e0-9cf8-71df31954734 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/833d47ed-3714-4f4d-9745-59f708aa217e b/docstore/833d47ed-3714-4f4d-9745-59f708aa217e new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/833d47ed-3714-4f4d-9745-59f708aa217e @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. 
Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/83520a25-8217-4a31-a818-5fffcc37f2cd b/docstore/83520a25-8217-4a31-a818-5fffcc37f2cd new file mode 100644 index 0000000000000000000000000000000000000000..364e8cf96b8203b386c16dcd68715ac157e4ab45 --- /dev/null +++ b/docstore/83520a25-8217-4a31-a818-5fffcc37f2cd @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate#client Title: Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8363dbab-d562-451a-a166-b40d05110322 b/docstore/8363dbab-d562-451a-a166-b40d05110322 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/8363dbab-d562-451a-a166-b40d05110322 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/8392195c-ab52-4f87-a06f-8f5179b7145d b/docstore/8392195c-ab52-4f87-a06f-8f5179b7145d new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/8392195c-ab52-4f87-a06f-8f5179b7145d @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/83a2ce69-92b0-433a-84f0-a370a2f794e9 b/docstore/83a2ce69-92b0-433a-84f0-a370a2f794e9 new file mode 100644 index 0000000000000000000000000000000000000000..9139684952e56c77fdce37ab354efce80520ded1 --- /dev/null +++ b/docstore/83a2ce69-92b0-433a-84f0-a370a2f794e9 @@ -0,0 +1 @@ +dict ). When possible, the SDK will parse the returned JSON, and return the result in response.parsed . If you provided a pydantic class as the schema the SDK will convert that JSON to an instance of the class. from google import genai from pydantic import BaseModel client = genai . 
Client () class CountryInfo ( BaseModel ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Give me information of the United States.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : CountryInfo , }, ) response . parsed JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "List a few popular cookie recipes." , config : { responseMimeType : "application/json" , responseSchema : { type : "array" , items : { type : "object" , properties : { recipeName : { type : "string" }, ingredients : { type : "array" , items : { type : "string" } }, }, required : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); Files Upload Upload a file: Before Python import requests import pathlib import google.generativeai as genai # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) file = genai . upload_file ( path = 'a11.txt' ) model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Can you summarize this file:' , my_file ]) print ( response . text ) After Python import requests import pathlib from google import genai client = genai . Client () # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) \ No newline at end of file diff --git a/docstore/83aacee6-6fab-4ac9-a128-18894be5928c b/docstore/83aacee6-6fab-4ac9-a128-18894be5928c new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/83aacee6-6fab-4ac9-a128-18894be5928c @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. 
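Each model code listed under Model variants below is the string you pass as the model argument when calling the API. A minimal sketch, assuming the google-genai Python SDK used elsewhere in this document and a GEMINI_API_KEY available in the environment: Python
from google import genai

# Assumes GEMINI_API_KEY is set in the environment, as in the REST examples.
client = genai.Client()

# Swap in any model code from the variant tables below, e.g. "gemini-2.5-pro"
# or "gemini-2.5-flash-lite-preview-06-17".
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="In one sentence, what is the Gemini API?",
)
print(response.text)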
Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/83ad6237-68c5-4db2-9b08-34840aad2d4b b/docstore/83ad6237-68c5-4db2-9b08-34840aad2d4b new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/83ad6237-68c5-4db2-9b08-34840aad2d4b @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/83db957d-abd1-4193-af00-596d5b3891e7 b/docstore/83db957d-abd1-4193-af00-596d5b3891e7 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/83db957d-abd1-4193-af00-596d5b3891e7 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/84114c0c-564c-46bc-b04c-6c153c876860 b/docstore/84114c0c-564c-46bc-b04c-6c153c876860 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/84114c0c-564c-46bc-b04c-6c153c876860 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" 
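// Thinking is on by default for the 2.5 models to improve quality, at the cost of extra latency and tokens; the zero budget below opts this 2.5 Flash request out of it.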
, config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/843522b6-6c97-49b9-8609-33af99273b0b b/docstore/843522b6-6c97-49b9-8609-33af99273b0b new file mode 100644 index 0000000000000000000000000000000000000000..771c2c741948f29f5c3605e7090d7f1d54bfcf1f --- /dev/null +++ b/docstore/843522b6-6c97-49b9-8609-33af99273b0b @@ -0,0 +1 @@ +For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion" , "String" , "Woodwind" , "Brass" , or " "Keyboard" ". You could create an enum to help with this task. In the following example, you pass an enum as the responseSchema , constraining the model to choose the most appropriate option. Python from google import genai import enum class Instrument ( enum . Enum ): PERCUSSION = "Percussion" STRING = "String" WOODWIND = "Woodwind" BRASS = "Brass" KEYBOARD = "Keyboard" client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : Instrument , }, ) print ( response . text ) # Woodwind JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "What type of instrument is an oboe?" , config : { responseMimeType : "text/x.enum" , responseSchema : { type : Type . STRING , enum : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, }); console . log ( response . text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "What type of instrument is an oboe?" } ] }], "generationConfig": { "responseMimeType": "text/x.enum", "responseSchema": { "type": "STRING", "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"] } } }' The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ( Schema ). There are two other ways to specify an enumeration. You can use a Literal : ``` Python Literal [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ] \ No newline at end of file diff --git a/docstore/8435e407-2664-42bd-b212-783b938dfc29 b/docstore/8435e407-2664-42bd-b212-783b938dfc29 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/8435e407-2664-42bd-b212-783b938dfc29 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. 
For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/843e73fd-d92d-483a-8a74-186ad4737516 b/docstore/843e73fd-d92d-483a-8a74-186ad4737516 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/843e73fd-d92d-483a-8a74-186ad4737516 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . 
get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/84437429-87e7-4427-8ae4-d4fd3c3ab4be b/docstore/84437429-87e7-4427-8ae4-d4fd3c3ab4be new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/84437429-87e7-4427-8ae4-d4fd3c3ab4be @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/8487add9-bffd-4387-967c-127732ce8297 b/docstore/8487add9-bffd-4387-967c-127732ce8297 new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/8487add9-bffd-4387-967c-127732ce8297 @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . 
serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/848b072d-3ecd-4834-96fb-7d5dd7d1f7ab b/docstore/848b072d-3ecd-4834-96fb-7d5dd7d1f7ab new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/848b072d-3ecd-4834-96fb-7d5dd7d1f7ab @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. 
The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/84a9edc7-f0e7-4840-be5b-8b17cddf0fda b/docstore/84a9edc7-f0e7-4840-be5b-8b17cddf0fda new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/84a9edc7-f0e7-4840-be5b-8b17cddf0fda @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/84b4dac7-03b4-4d0a-8374-392d5bb9b8e5 b/docstore/84b4dac7-03b4-4d0a-8374-392d5bb9b8e5 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/84b4dac7-03b4-4d0a-8374-392d5bb9b8e5 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/84cc09bd-cb86-49ba-893c-d7ccb9e41293 b/docstore/84cc09bd-cb86-49ba-893c-d7ccb9e41293 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/84cc09bd-cb86-49ba-893c-d7ccb9e41293 @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . 
reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/84cf1ffd-8b81-435b-88fe-75b65b0a07c5 b/docstore/84cf1ffd-8b81-435b-88fe-75b65b0a07c5 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/84cf1ffd-8b81-435b-88fe-75b65b0a07c5 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/84e0aecf-4a2f-480f-b731-7786030c8618 b/docstore/84e0aecf-4a2f-480f-b731-7786030c8618 new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/84e0aecf-4a2f-480f-b731-7786030c8618 @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
\ No newline at end of file diff --git a/docstore/84e39632-e35e-4ebb-b0f6-2f364c0468cf b/docstore/84e39632-e35e-4ebb-b0f6-2f364c0468cf new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/84e39632-e35e-4ebb-b0f6-2f364c0468cf @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/84fb1564-23d5-4f95-8d66-7c00e3b4146d b/docstore/84fb1564-23d5-4f95-8d66-7c00e3b4146d new file mode 100644 index 0000000000000000000000000000000000000000..b1044b06e974ef70df5275060bd78c27b49af935 --- /dev/null +++ b/docstore/84fb1564-23d5-4f95-8d66-7c00e3b4146d @@ -0,0 +1 @@ +ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . 
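As a sketch of the manually created schema case described above: the fields are illustrative (a recipe object), and the snake_case property_ordering argument on types.Schema is assumed to map to the REST propertyOrdering[] field shown earlier. Python
from google import genai
from google.genai import types

client = genai.Client()

# Manually constructed schema with an explicit property order.
recipe_schema = types.Schema(
    type=types.Type.OBJECT,
    properties={
        "recipeName": types.Schema(type=types.Type.STRING),
        "ingredients": types.Schema(
            type=types.Type.ARRAY,
            items=types.Schema(type=types.Type.STRING),
        ),
    },
    property_ordering=["recipeName", "ingredients"],  # assumed field name
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Give me a simple cookie recipe.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=recipe_schema,
    ),
)
print(response.text)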
Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the \ No newline at end of file diff --git a/docstore/84fbb490-726d-420a-9819-fa1af912a186 b/docstore/84fbb490-726d-420a-9819-fa1af912a186 new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/84fbb490-726d-420a-9819-fa1af912a186 @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. 
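A minimal sketch of sending a small few-shot prompt like the one above through the API; the two worked examples are taken from the passage, and the model name is a placeholder you can swap while experimenting with the number of examples. Python
from google import genai

client = genai.Client()

# Two worked examples followed by the new question; add or remove examples
# to see where the model starts to overfit to them.
few_shot_prompt = """Question: Why is the sky blue?
Answer: Due to Rayleigh scattering.

Question: What is the cause of earthquakes?
Answer: Sudden release of energy in the Earth's crust.

Question: How is snow formed?
Answer:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)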
Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/850b846f-00e1-41b3-bbb9-07af0f1f52f6 b/docstore/850b846f-00e1-41b3-bbb9-07af0f1f52f6 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/850b846f-00e1-41b3-bbb9-07af0f1f52f6 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/855be367-4aef-4874-bdb6-9aac3170e587 b/docstore/855be367-4aef-4874-bdb6-9aac3170e587 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/855be367-4aef-4874-bdb6-9aac3170e587 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . 
debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/855e0898-fede-413c-a2e6-b7e6eed03ffe b/docstore/855e0898-fede-413c-a2e6-b7e6eed03ffe new file mode 100644 index 0000000000000000000000000000000000000000..79af4391feda972140ebff71bc9d49f207aa048c --- /dev/null +++ b/docstore/855e0898-fede-413c-a2e6-b7e6eed03ffe @@ -0,0 +1 @@ +Long context | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Long context Many Gemini models come with large context windows of 1 million or more tokens. Historically, large language models (LLMs) were significantly limited by the amount of text (or tokens) that could be passed to the model at one time. The Gemini long context window unlocks many new use cases and developer paradigms. The code you already use for cases like text generation or multimodal inputs will work without any changes with long context. This document gives you an overview of what you can achieve using models with context windows of 1M and more tokens. The page gives a brief overview of a context window, and explores how developers should think about long context, various real world use cases for long context, and ways to optimize the usage of long context. For the context window sizes of specific models, see the Models page. What is a context window? The basic way you use the Gemini models is by passing information (context) to the model, which will subsequently generate a response. An analogy for the context window is short term memory. There is a limited amount of information that can be stored in someone's short term memory, and the same is true for generative models. You can read more about how models work under the hood in our generative models guide . Getting started with long context Earlier versions of generative models were only able to process 8,000 tokens at a time. Newer models pushed this further by accepting 32,000 or even 128,000 tokens. Gemini is the first model capable of accepting 1 million tokens. 
In practice, 1 million tokens would look like: \ No newline at end of file diff --git a/docstore/8561e2cf-9565-425f-b43d-80bdc0f15959 b/docstore/8561e2cf-9565-425f-b43d-80bdc0f15959 new file mode 100644 index 0000000000000000000000000000000000000000..fdb424117e2ce912b4647b8c4a13b58690bfa0f7 --- /dev/null +++ b/docstore/8561e2cf-9565-425f-b43d-80bdc0f15959 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#inline-image Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/856e9b70-4004-41fd-985a-0deb09be95f1 b/docstore/856e9b70-4004-41fd-985a-0deb09be95f1 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/856e9b70-4004-41fd-985a-0deb09be95f1 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/8573d521-00a1-4480-a74d-3a5ee6762a67 b/docstore/8573d521-00a1-4480-a74d-3a5ee6762a67 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/8573d521-00a1-4480-a74d-3a5ee6762a67 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/85744f44-7eb7-4f95-a57d-e915990929ad b/docstore/85744f44-7eb7-4f95-a57d-e915990929ad new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/85744f44-7eb7-4f95-a57d-e915990929ad @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . 
Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/860c3b45-b110-4feb-8592-b1368e6dc8a4 b/docstore/860c3b45-b110-4feb-8592-b1368e6dc8a4 new file mode 100644 index 0000000000000000000000000000000000000000..185cd7c14b73dd4804292716b4231cde98556b13 --- /dev/null +++ b/docstore/860c3b45-b110-4feb-8592-b1368e6dc8a4 @@ -0,0 +1 @@ +Safety settings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Safety settings The Gemini API provides safety settings that you can adjust during the prototyping stage to determine if your application requires more or less restrictive safety configuration. You can adjust these settings across five filter categories to restrict or allow certain types of content. This guide covers how the Gemini API handles safety settings and filtering and how you can change the safety settings for your application. Note: Applications that use less restrictive safety settings may be subject to review. See the Terms of Service for more information. Safety filters The Gemini API's adjustable safety filters cover the following categories: Category Description Harassment Negative or harmful comments targeting identity and/or protected attributes. Hate speech Content that is rude, disrespectful, or profane. Sexually explicit Contains references to sexual acts or other lewd content. Dangerous Promotes, facilitates, or encourages harmful acts. Civic integrity Election-related queries. These categories are defined in HarmCategory . The Gemini models only support HARM_CATEGORY_HARASSMENT , HARM_CATEGORY_HATE_SPEECH , HARM_CATEGORY_SEXUALLY_EXPLICIT , HARM_CATEGORY_DANGEROUS_CONTENT , and HARM_CATEGORY_CIVIC_INTEGRITY . All other categories are used only by PaLM 2 (Legacy) models. You can use these filters to adjust what's appropriate for your use case. 
For example, if you're building video game dialogue, you may deem it acceptable to allow more content that's rated as Dangerous due to the nature of the game. In addition to the adjustable safety filters, the \ No newline at end of file diff --git a/docstore/86258006-b40b-4fc2-bd81-7ac5968ae875 b/docstore/86258006-b40b-4fc2-bd81-7ac5968ae875 new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/86258006-b40b-4fc2-bd81-7ac5968ae875 @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/862b2df0-b4df-4656-aa23-a186e3666e8f b/docstore/862b2df0-b4df-4656-aa23-a186e3666e8f new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/862b2df0-b4df-4656-aa23-a186e3666e8f @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" 
, uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/862c104e-2b6e-4c4d-9d76-0f4a12458e10 b/docstore/862c104e-2b6e-4c4d-9d76-0f4a12458e10 new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/862c104e-2b6e-4c4d-9d76-0f4a12458e10 @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . 
usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/8647d57d-1cde-43b9-9235-2d2a1af794b4 b/docstore/8647d57d-1cde-43b9-9235-2d2a1af794b4 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/8647d57d-1cde-43b9-9235-2d2a1af794b4 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/864e7e9c-d104-42b7-aee7-c2ad1ee867e2 b/docstore/864e7e9c-d104-42b7-aee7-c2ad1ee867e2 new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/864e7e9c-d104-42b7-aee7-c2ad1ee867e2 @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . 
input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/864f763a-5a11-436b-901e-8330ea1139d9 b/docstore/864f763a-5a11-436b-901e-8330ea1139d9 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/864f763a-5a11-436b-901e-8330ea1139d9 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
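As a rough illustration of calling this experimental embedding model from the Python SDK, here is a minimal sketch. It reuses the embed_content pattern shown elsewhere on this page; the .values attribute on the returned embedding is an assumption made for illustration.

Python
from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Request an embedding from the experimental embedding model.
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
)

# Each entry in result.embeddings carries the vector for one input.
embedding = result.embeddings[0]
print(len(embedding.values))  # dimension of the returned vector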
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/86511aa3-b176-4e01-b069-bde8310f7e89 b/docstore/86511aa3-b176-4e01-b069-bde8310f7e89 new file mode 100644 index 0000000000000000000000000000000000000000..f3a5c8d51af4fe74b88a61d9a283d0c7a963f683 --- /dev/null +++ b/docstore/86511aa3-b176-4e01-b069-bde8310f7e89 @@ -0,0 +1 @@ +workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off \ No newline at end of file diff --git a/docstore/8656e542-0fc6-423a-bfbc-b74f1a0bb140 b/docstore/8656e542-0fc6-423a-bfbc-b74f1a0bb140 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/8656e542-0fc6-423a-bfbc-b74f1a0bb140 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. 
System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/866ec1e4-b0a0-4930-98f9-693ba5352c28 b/docstore/866ec1e4-b0a0-4930-98f9-693ba5352c28 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/866ec1e4-b0a0-4930-98f9-693ba5352c28 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
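To make the version name patterns above concrete, here is a minimal sketch (Python, reusing the generate_content call used throughout these docs) that contrasts tracking the latest stable release with pinning one specific stable version. The prompt and model names are taken from examples on this page; the comparison loop itself is only for illustration.

Python
from google import genai

client = genai.Client()

# "gemini-2.0-flash" tracks the latest stable 2.0 Flash release,
# while "gemini-2.0-flash-001" pins one specific stable version.
for model_name in ("gemini-2.0-flash", "gemini-2.0-flash-001"):
    response = client.models.generate_content(
        model=model_name,
        contents="Explain how AI works in a few words",
    )
    print(model_name, "->", response.text)

Pinning a stable version is the safer default for production apps, since the model behind the alias never changes underneath you.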
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/868dd289-9d74-4313-b326-94485f421809 b/docstore/868dd289-9d74-4313-b326-94485f421809 new file mode 100644 index 0000000000000000000000000000000000000000..3a4b28b6b9ea1c7892a83325e4d8510d6a0b28de --- /dev/null +++ b/docstore/868dd289-9d74-4313-b326-94485f421809 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/system-instructions#system-instructions Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8694760c-97ac-44e6-b96e-7b54d42bea46 b/docstore/8694760c-97ac-44e6-b96e-7b54d42bea46 new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/8694760c-97ac-44e6-b96e-7b54d42bea46 @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/86d8874e-56a0-4e9e-804e-3e6a038b6cad b/docstore/86d8874e-56a0-4e9e-804e-3e6a038b6cad new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/86d8874e-56a0-4e9e-804e-3e6a038b6cad @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. 
For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/8728626a-dee2-4079-acf5-f67f6875d969 b/docstore/8728626a-dee2-4079-acf5-f67f6875d969 new file mode 100644 index 0000000000000000000000000000000000000000..3f6e1470cdfa61b2bd7e98af2bbd2eef6f808d86 --- /dev/null +++ b/docstore/8728626a-dee2-4079-acf5-f67f6875d969 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#controllable Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/87504586-d5c3-4244-9577-d821105fa47b b/docstore/87504586-d5c3-4244-9577-d821105fa47b new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/87504586-d5c3-4244-9577-d821105fa47b @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. 
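For instance, a minimal sketch of passing an explicit budget through the Python SDK, assuming the ThinkingConfig pattern that appears in the examples further down this page; the 1,024-token value is only an illustration, not a recommendation.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the concept of Occam's Razor and provide a simple, everyday example.",
    config=types.GenerateContentConfig(
        # Cap this request at roughly 1,024 thinking tokens.
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)
print(response.text)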
A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/8765a798-d39a-48d4-8728-1f22482f3e34 b/docstore/8765a798-d39a-48d4-8728-1f22482f3e34 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/8765a798-d39a-48d4-8728-1f22482f3e34 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . 
ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/87678707-76da-4501-bfc2-fe282c6a8027 b/docstore/87678707-76da-4501-bfc2-fe282c6a8027 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/87678707-76da-4501-bfc2-fe282c6a8027 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/876f95a5-f72f-43a0-89fc-a668a207ae31 b/docstore/876f95a5-f72f-43a0-89fc-a668a207ae31 new file mode 100644 index 0000000000000000000000000000000000000000..a5a6b39c097d8cbbd04646d1c0a8361a10d2c9ae --- /dev/null +++ b/docstore/876f95a5-f72f-43a0-89fc-a668a207ae31 @@ -0,0 +1 @@ +meanings as statements, which means that a RAG system won't automatically recognize their relation. Task types enable you to generate optimized embeddings for specific tasks, saving you time and cost and improving performance. Python from google import genai from google.genai import types client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" , config = types . EmbedContentConfig ( task_type = "SEMANTIC_SIMILARITY" ) ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' 
, config : { taskType : "SEMANTIC_SIMILARITY" , } }); console . log ( response . embeddings ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]}, "taskType": "SEMANTIC_SIMILARITY" }' Supported task types Task type Description SEMANTIC_SIMILARITY Used to generate embeddings that are optimized to assess text similarity. CLASSIFICATION Used to generate embeddings that are optimized to classify texts according to preset labels. CLUSTERING Used to generate embeddings that are optimized to cluster texts based on their similarities. RETRIEVAL_DOCUMENT , RETRIEVAL_QUERY , QUESTION_ANSWERING , and FACT_VERIFICATION Used to generate embeddings that are optimized for document search or information retrieval. CODE_RETRIEVAL_QUERY Used to retrieve a code block based on a natural language query, such as sort an array or reverse a linked list. Embeddings of the code blocks are computed using RETRIEVAL_DOCUMENT . Use cases Text embeddings \ No newline at end of file diff --git a/docstore/879b618f-7311-4954-bd05-b6c06ac9c69c b/docstore/879b618f-7311-4954-bd05-b6c06ac9c69c new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/879b618f-7311-4954-bd05-b6c06ac9c69c @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
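To tie the SEMANTIC_SIMILARITY task type above to a concrete use case, here is a minimal sketch that embeds two texts and compares them. It reuses the embed_content call shown above; the cosine helper and the .values attribute on the returned embeddings are assumptions made for illustration.

Python
import math
from google import genai
from google.genai import types

client = genai.Client()

def embed(text: str) -> list[float]:
    # Request a similarity-optimized embedding for one piece of text.
    result = client.models.embed_content(
        model="gemini-embedding-exp-03-07",
        contents=text,
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
    )
    return result.embeddings[0].values

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

# Higher scores indicate closer meaning.
print(cosine(embed("What is the meaning of life?"),
             embed("Why do we exist?")))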
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/87a54d52-60b6-48be-9cf8-f21cea63edad b/docstore/87a54d52-60b6-48be-9cf8-f21cea63edad new file mode 100644 index 0000000000000000000000000000000000000000..5389b5d9d1b7115f0b4483b4e1da807b20a5cfd5 --- /dev/null +++ b/docstore/87a54d52-60b6-48be-9cf8-f21cea63edad @@ -0,0 +1 @@ +such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake \ No newline at end of file diff --git a/docstore/87b4f4c2-fec2-4b73-a040-2b181e60a6aa b/docstore/87b4f4c2-fec2-4b73-a040-2b181e60a6aa new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/87b4f4c2-fec2-4b73-a040-2b181e60a6aa @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
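As a minimal sketch of using this model with the Live API from Python, the snippet below streams a short chunk of 16 kHz PCM audio and prints the text the model sends back. It follows the connect and send_realtime_input pattern shown in the Live API examples on this page; the 16000.pcm file path and the .text convenience property on received messages are assumptions for illustration.

Python
import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-2.0-flash-live-001"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Stream a short chunk of 16 kHz PCM audio to the model.
        audio_data = Path("16000.pcm").read_bytes()
        await session.send_realtime_input(
            audio=types.Blob(data=audio_data, mime_type="audio/pcm;rate=16000")
        )
        # Print whatever text the model returns for this turn.
        async for msg in session.receive():
            if msg.text:
                print(msg.text, end="")

if __name__ == "__main__":
    asyncio.run(main())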
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/87b78a51-08cc-4c3d-b91a-af71dfb6fbd6 b/docstore/87b78a51-08cc-4c3d-b91a-af71dfb6fbd6 new file mode 100644 index 0000000000000000000000000000000000000000..46dc106c387700742db50f2912cf28b003e737e9 --- /dev/null +++ b/docstore/87b78a51-08cc-4c3d-b91a-af71dfb6fbd6 @@ -0,0 +1 @@ +ClientConfig { APIKey : " YOUR_API_KEY " , Backend : genai . BackendGeminiAPI , }) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = Client . builder (). apiKey ( " YOUR_API_KEY " ). build (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $ YOUR_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Keep your API key secure Treat your Gemini API key like a password. If compromised, others can use your project's quota, incur charges (if billing is enabled), and access your private data, such as files. Critical security rules Never commit API keys to source control. Do not check your API key into version control systems like Git. Never expose API keys on the client-side. Do not use your API key directly in web or mobile apps in production. Keys in client-side code (including our JavaScript/TypeScript libraries and REST calls) can be extracted. Best practices Use server-side calls with API keys The most secure way to use your API key is to call the Gemini API from a server-side application where the key can be kept confidential. 
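A minimal server-side sketch of that pattern in Python: the key is read from the environment (or a secret manager) rather than hard-coded or shipped to browsers and mobile apps. The environment variable name matches the GEMINI_API_KEY used in the REST examples on this page.

Python
import os
from google import genai

# Keep the key on the server: load it from the environment at startup.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)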
Use ephemeral tokens for client-side access (Live API only): For direct client-side access to the Live API, you can use ephemeral tokens. They come with lower security risks and can be \ No newline at end of file diff --git a/docstore/87bd86e3-4335-4d10-8557-f6b331aff777 b/docstore/87bd86e3-4335-4d10-8557-f6b331aff777 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/87bd86e3-4335-4d10-8557-f6b331aff777 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/87d9e053-725b-4b32-a20c-6471562df457 b/docstore/87d9e053-725b-4b32-a20c-6471562df457 new file mode 100644 index 0000000000000000000000000000000000000000..cc5e2a23a1aa933103609ebe99004920f218cf78 --- /dev/null +++ b/docstore/87d9e053-725b-4b32-a20c-6471562df457 @@ -0,0 +1 @@ +"google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) localPdfPath := "/path/to/file.pdf" uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Give me a summary of this pdf file." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . 
RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST NUM_BYTES = $( wc -c < " ${ PDF_PATH } " ) DISPLAY_NAME = TEXT tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GEMINI_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: application/pdf" \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ PDF_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl \ No newline at end of file diff --git a/docstore/87e6abbb-4936-4a75-a9b6-f1d4adf2000d b/docstore/87e6abbb-4936-4a75-a9b6-f1d4adf2000d new file mode 100644 index 0000000000000000000000000000000000000000..53e5ed0c4b3c9d5f8d129df24753928921198efa --- /dev/null +++ b/docstore/87e6abbb-4936-4a75-a9b6-f1d4adf2000d @@ -0,0 +1 @@ +text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ { "parts": [ {"text": "Who won the euro 2024?"} ] } ], "tools": [ { "google_search": {} } ] }' You can learn more by trying the Search tool notebook . How grounding with Google Search works When you enable the google_search tool, the model handles the entire workflow of searching, processing, and citing information automatically. User Prompt: Your application sends a user's prompt to the Gemini API with the google_search tool enabled. Prompt Analysis: The model analyzes the prompt and determines if a Google Search can improve the answer. Google Search: If needed, the model automatically generates one or multiple search queries and executes them. Search Results Processing: The model processes the search results, synthesizes the information, and formulates a response. Grounded Response: The API returns a final, user-friendly response that is grounded in the search results. This response includes the model's text answer and groundingMetadata with the search queries, web results, and citations. Understanding the Grounding Response When a response is successfully grounded, the response includes a groundingMetadata field. This structured data is essential for verifying claims and building a rich citation experience in your application. { "candidates" : [ { "content" : { "parts" : [ { "text" : "Spain won Euro 2024, defeating England 2-1 in the final. This victory marks Spain's record fourth European Championship title." } ], "role" : "model" }, "groundingMetadata" : { "webSearchQueries" : [ "UEFA Euro 2024 winner" , "who won euro 2024" ], "searchEntryPoint" : { "renderedContent" : "" }, "groundingChunks" : [ { "web" : { "uri" : "https://vertexaisearch.cloud.google.com....." 
, "title" : "aljazeera.com" }}, { "web" : \ No newline at end of file diff --git a/docstore/87f1aa0e-09a5-42c7-b885-626c2a17734a b/docstore/87f1aa0e-09a5-42c7-b885-626c2a17734a new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/87f1aa0e-09a5-42c7-b885-626c2a17734a @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/8814e34c-72f2-498a-83fe-dc86877f4589 b/docstore/8814e34c-72f2-498a-83fe-dc86877f4589 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/8814e34c-72f2-498a-83fe-dc86877f4589 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/883ca2b3-3b4c-4945-aa24-512dce382990 b/docstore/883ca2b3-3b4c-4945-aa24-512dce382990 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/883ca2b3-3b4c-4945-aa24-512dce382990 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/88481800-bff6-4fac-ab71-c2ac7a80e17c b/docstore/88481800-bff6-4fac-ab71-c2ac7a80e17c new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/88481800-bff6-4fac-ab71-c2ac7a80e17c @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/8883fcfb-dccc-4b3f-8696-809c36cfc16f b/docstore/8883fcfb-dccc-4b3f-8696-809c36cfc16f new file mode 100644 index 0000000000000000000000000000000000000000..10c56dda4e771cbe191acbd7eaea4d6ff44484f5 --- /dev/null +++ b/docstore/8883fcfb-dccc-4b3f-8696-809c36cfc16f @@ -0,0 +1 @@ +which you can get in Google AI Studio . base_url="https://generativelanguage.googleapis.com/v1beta/openai/" : This tells the OpenAI library to send requests to the Gemini API endpoint instead of the default URL. model="gemini-2.0-flash" : Choose a compatible Gemini model Thinking Gemini 2.5 models are trained to think through complex problems, leading to significantly improved reasoning. The Gemini API comes with a "thinking budget" parameter which gives fine grain control over how much the model will think. Unlike the Gemini API, the OpenAI API offers three levels of thinking control: "low" , "medium" , and "high" , which map to 1,024, 8,192, and 24,576 tokens, respectively. If you want to disable thinking, you can set reasoning_effort to "none" (note that reasoning cannot be turned off for 2.5 Pro models). Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , reasoning_effort = "low" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . 
message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , reasoning_effort : "low" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "reasoning_effort": "low", \ No newline at end of file diff --git a/docstore/8887f685-ec36-4775-a616-799fd411fc07 b/docstore/8887f685-ec36-4775-a616-799fd411fc07 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/8887f685-ec36-4775-a616-799fd411fc07 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/889e8d2c-7597-46ac-a499-d39e2cf2b99a b/docstore/889e8d2c-7597-46ac-a499-d39e2cf2b99a new file mode 100644 index 0000000000000000000000000000000000000000..8c60a97b59d947e95247d6e4ee3eb21605ab2ae3 --- /dev/null +++ b/docstore/889e8d2c-7597-46ac-a499-d39e2cf2b99a @@ -0,0 +1 @@ +open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . 
Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of \ No newline at end of file diff --git a/docstore/88aef52b-fde4-4f77-b704-3df170195c0a b/docstore/88aef52b-fde4-4f77-b704-3df170195c0a new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/88aef52b-fde4-4f77-b704-3df170195c0a @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. 
Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/88c4a316-5024-4cbe-af92-2637724f9015 b/docstore/88c4a316-5024-4cbe-af92-2637724f9015 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/88c4a316-5024-4cbe-af92-2637724f9015 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/88c6c73d-4907-4346-9648-e1ccd5780f76 b/docstore/88c6c73d-4907-4346-9648-e1ccd5780f76 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/88c6c73d-4907-4346-9648-e1ccd5780f76 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. 
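As a rough illustration of the Error Handling guidance in the best-practices list above (return informative error messages the model can relay to the user instead of raising), here is a sketch of a tool function with a hypothetical `lookup_temperature` helper standing in for a real weather API:

```python
def lookup_temperature(location: str) -> float:
    # Hypothetical stand-in for a real weather API call.
    known = {"Boston, MA": 22.0, "San Francisco, CA": 18.0}
    return known[location]  # raises KeyError for unknown locations


def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location.

    Returns an informative error payload instead of raising, so the model
    can generate a helpful response for the user.
    """
    try:
        temperature = lookup_temperature(location)
    except KeyError:
        return {"error": f"Unknown location: {location}. Ask the user to clarify the city and state."}
    except TimeoutError:
        return {"error": "The weather service timed out. Suggest trying again later."}
    return {"temperature": temperature, "unit": "Celsius"}
```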
Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/88c8714f-0c52-4826-b3bd-94fe88d695b7 b/docstore/88c8714f-0c52-4826-b3bd-94fe88d695b7 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/88c8714f-0c52-4826-b3bd-94fe88d695b7 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/88d03604-4fca-42c9-ba01-9b95aa2ab474 b/docstore/88d03604-4fca-42c9-ba01-9b95aa2ab474 new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/88d03604-4fca-42c9-ba01-9b95aa2ab474 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . 
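A short sketch of how these version name patterns are used in practice, pinning a specific stable release versus tracking the latest stable alias (assuming the google-genai SDK):

```python
from google import genai

client = genai.Client()

# Latest stable alias: tracks the newest stable release of this model variation.
latest = client.models.generate_content(
    model="gemini-2.0-flash", contents="Hello"
)

# Pinned stable version: usually doesn't change, recommended for production apps.
pinned = client.models.generate_content(
    model="gemini-2.0-flash-001", contents="Hello"
)

print(latest.text)
print(pinned.text)
```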
writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/892df873-32b6-4e93-a43e-679b23489f2e b/docstore/892df873-32b6-4e93-a43e-679b23489f2e new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/892df873-32b6-4e93-a43e-679b23489f2e @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. 
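As a rough sketch of the input-audio requirements described above (raw little-endian 16-bit PCM with the sample rate declared in the MIME type), the following converts a WAV file to 16 kHz PCM and wraps it in a Blob; it assumes the librosa and soundfile packages and an already-open Live API session like the one in the surrounding example:

```python
import io

import librosa
import soundfile as sf
from google.genai import types

# Resample to 16 kHz and serialize as raw little-endian 16-bit PCM.
y, sr = librosa.load("sample.wav", sr=16000)
buffer = io.BytesIO()
sf.write(buffer, y, sr, format="RAW", subtype="PCM_16")
buffer.seek(0)

# Declare the sample rate in the MIME type so the API knows how to interpret the bytes.
audio_blob = types.Blob(data=buffer.read(), mime_type="audio/pcm;rate=16000")

# Inside an active session (see the example above), the blob would be sent with
# something like: await session.send_realtime_input(audio=audio_blob)
```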
A single function \ No newline at end of file diff --git a/docstore/89362d44-f2a7-4f25-84e6-182264a3a041 b/docstore/89362d44-f2a7-4f25-84e6-182264a3a041 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/89362d44-f2a7-4f25-84e6-182264a3a041 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/8950dd7c-e5ae-402e-8b0f-581a596aaed5 b/docstore/8950dd7c-e5ae-402e-8b0f-581a596aaed5 new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/8950dd7c-e5ae-402e-8b0f-581a596aaed5 @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. 
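A minimal sketch of such a declaration for the set_light_values example used earlier, written in the OpenAPI-subset JSON described above and wrapped in a Tool (assuming the google-genai SDK):

```python
from google.genai import types

set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {
                "type": "integer",
                "description": "Light level from 0 to 100. Zero is off and 100 is full brightness.",
            },
            "color_temp": {
                "type": "string",
                "enum": ["daylight", "cool", "warm"],
                "description": "Color temperature of the light fixture.",
            },
        },
        "required": ["brightness", "color_temp"],
    },
}

# The tools object holds one or more function declarations.
tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])
```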
Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/89526946-0a3f-4faa-b521-9ae907e1f3bc b/docstore/89526946-0a3f-4faa-b521-9ae907e1f3bc new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/89526946-0a3f-4faa-b521-9ae907e1f3bc @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/895aabe4-c568-49ab-b3c0-1671a67bd689 b/docstore/895aabe4-c568-49ab-b3c0-1671a67bd689 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/895aabe4-c568-49ab-b3c0-1671a67bd689 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
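The resumable REST flow above is what the SDKs wrap for you; a compact Python equivalent using the Files API (assuming the google-genai SDK) is:

```python
from google import genai

client = genai.Client()

# Upload once, then reference the returned file object in the prompt.
uploaded_file = client.files.upload(file="path/to/sample.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[uploaded_file, "Caption this image."],
)
print(response.text)
```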
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/895f68d0-ea5d-47ad-acc4-fa5a52d45174 b/docstore/895f68d0-ea5d-47ad-acc4-fa5a52d45174 new file mode 100644 index 0000000000000000000000000000000000000000..b410c3d6fc95b8ad9abaf080d511bad548d2b4e0 --- /dev/null +++ b/docstore/895f68d0-ea5d-47ad-acc4-fa5a52d45174 @@ -0,0 +1 @@ +get all 50." ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , tools : [{ codeExecution : {} }], }); const result = await model . generateContent ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get " + "all 50." , ); console . log ( result . response . text ()); After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the sum of the first 50 prime numbers? Generate and run ' 'code for the calculation, and make sure you get all 50.' , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )], ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-pro-exp-02-05" , contents : `Write and execute code that calculates the sum of the first 50 prime numbers. Ensure that only the executable code and its resulting output are generated.` , }); // Each part may contain text, executable code, or an execution result. for ( const part of response . candidates [ 0 ]. content . parts ) { console . log ( part ); console . log ( "\n" ); } console . log ( "-" . 
repeat ( 80 )); // The `.text` accessor concatenates the parts into a markdown-formatted text. console . log ( "\n" , response . text ); Search grounding GoogleSearch (Gemini>=2.0) and GoogleSearchRetrieval (Gemini < 2.0) are tools that allow the model to retrieve public web data for grounding, powered by Google. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( contents = "what is the \ No newline at end of file diff --git a/docstore/89665e9b-1607-4547-99ac-02f73ea2637d b/docstore/89665e9b-1607-4547-99ac-02f73ea2637d new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/89665e9b-1607-4547-99ac-02f73ea2637d @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/898859b7-6856-4e74-87ad-816975798fe8 b/docstore/898859b7-6856-4e74-87ad-816975798fe8 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/898859b7-6856-4e74-87ad-816975798fe8 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
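The "Before" snippet above uses the legacy SDK; a sketch of the equivalent Search grounding call in the new google-genai SDK, using the GoogleSearch tool available for Gemini 2.0 and later models, would be:

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Who won the most recent Super Bowl?",
    config=types.GenerateContentConfig(
        # GoogleSearch lets the model ground its answer in public web results.
        tools=[types.Tool(google_search=types.GoogleSearch())],
    ),
)
print(response.text)
```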
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/898c9089-b8ba-48b2-8d2f-ba741f13038d b/docstore/898c9089-b8ba-48b2-8d2f-ba741f13038d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/898c9089-b8ba-48b2-8d2f-ba741f13038d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/89949e52-f452-4b8f-b848-4dfd9d677657 b/docstore/89949e52-f452-4b8f-b848-4dfd9d677657 new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/89949e52-f452-4b8f-b848-4dfd9d677657 @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/89a0a537-93aa-45f3-ac3d-eb136e7c0cb1 b/docstore/89a0a537-93aa-45f3-ac3d-eb136e7c0cb1 new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/89a0a537-93aa-45f3-ac3d-eb136e7c0cb1 @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 
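As a small illustration of the migration the table above recommends, here is a hedged sketch of the same text request written first with the legacy google-generativeai package and then with the replacement google-genai package:

```python
# Legacy library (all support, including bug fixes, ends end of September 2025)
import google.generativeai as legacy_genai

legacy_genai.configure(api_key="GEMINI_API_KEY")
model = legacy_genai.GenerativeModel("gemini-1.5-flash")
print(model.generate_content("Explain how AI works").text)

# Recommended library
from google import genai

client = genai.Client(api_key="GEMINI_API_KEY")
response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Explain how AI works"
)
print(response.text)
```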
3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/89c0b87c-d8ad-4558-90b0-ac86967bbf3f b/docstore/89c0b87c-d8ad-4558-90b0-ac86967bbf3f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/89c0b87c-d8ad-4558-90b0-ac86967bbf3f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
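Building on the note above that the Python SDK accepts a ClientSession directly in the tools parameter (enabling automatic tool calling for MCP tools), a rough sketch of wiring a local stdio MCP server into generate_content might look like the following; the server command and prompt are placeholders:

```python
import asyncio

from google import genai
from google.genai import types
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

client = genai.Client()

# Placeholder: launch whichever local MCP server you want to expose as tools.
server_params = StdioServerParameters(command="npx", args=["-y", "@example/your-mcp-server"])


async def main():
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Passing the session in `tools` lets the SDK execute MCP tool calls automatically.
            response = await client.aio.models.generate_content(
                model="gemini-2.5-flash",
                contents="What is the weather in London today?",
                config=types.GenerateContentConfig(tools=[session]),
            )
            print(response.text)


asyncio.run(main())
```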
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/89d082d0-44d2-4d8d-8802-279de1727de6 b/docstore/89d082d0-44d2-4d8d-8802-279de1727de6 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/89d082d0-44d2-4d8d-8802-279de1727de6 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/89d9a6e0-3e00-4218-9cae-9dca018f889b b/docstore/89d9a6e0-3e00-4218-9cae-9dca018f889b new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/89d9a6e0-3e00-4218-9cae-9dca018f889b @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/89e9d1c2-0973-4ee9-b44c-9dacb714d4a4 b/docstore/89e9d1c2-0973-4ee9-b44c-9dacb714d4a4 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/89e9d1c2-0973-4ee9-b44c-9dacb714d4a4 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. 
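To make the point above concrete, here is a sketch of that Step 4 with thought signatures preserved: the model's own content is appended unmodified (its parts carry the signatures), followed by the function response, before calling the model again. It assumes the response, tool_call, result, config, contents, and client variables from the earlier steps:

```python
from google.genai import types

# Create a function response part (unchanged from Step 4).
function_response_part = types.Part.from_function_response(
    name=tool_call.name,
    response={"result": result},
)

# Append the model's content as-is: its parts carry the thought signatures,
# so don't merge, split, or re-order them.
contents.append(response.candidates[0].content)
contents.append(types.Content(role="user", parts=[function_response_part]))

final_response = client.models.generate_content(
    model="gemini-2.5-flash",
    config=config,
    contents=contents,
)
print(final_response.text)
```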
Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/89f79115-c3dc-4800-8c91-0d68c36ca2d1 b/docstore/89f79115-c3dc-4800-8c91-0d68c36ca2d1 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/89f79115-c3dc-4800-8c91-0d68c36ca2d1 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . 
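A small sketch of assembling such a few-shot image prompt with the Python SDK, interleaving example images with answers in the desired format before the new image (the file paths are placeholders):

```python
from google import genai

client = genai.Client()

# Few-shot examples: each example image is followed by the answer in the target format.
colosseum = client.files.upload(file="colosseum.jpg")
forbidden_city = client.files.upload(file="forbidden_city.jpg")
new_image = client.files.upload(file="unknown_landmark.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "Determine the city along with the landmark.",
        colosseum, "city: Rome, landmark: the Colosseum.",
        forbidden_city, "city: Beijing, landmark: Forbidden City.",
        new_image,  # the model should answer in the same "city: ..., landmark: ..." format
    ],
)
print(response.text)
```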
mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/8a160b13-a795-4828-a3a5-20462057593c b/docstore/8a160b13-a795-4828-a3a5-20462057593c new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/8a160b13-a795-4828-a3a5-20462057593c @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/8a2a80cd-9704-4f65-8a32-6893762ebbdd b/docstore/8a2a80cd-9704-4f65-8a32-6893762ebbdd new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/8a2a80cd-9704-4f65-8a32-6893762ebbdd @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/8a314f19-0ed3-4e60-98b3-741230cf9c70 b/docstore/8a314f19-0ed3-4e60-98b3-741230cf9c70 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/8a314f19-0ed3-4e60-98b3-741230cf9c70 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/8a4d64d8-4e21-456f-8f16-a6387251f89f b/docstore/8a4d64d8-4e21-456f-8f16-a6387251f89f new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/8a4d64d8-4e21-456f-8f16-a6387251f89f @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/8a537e03-3281-42de-a4fc-d1b42bcc9837 b/docstore/8a537e03-3281-42de-a4fc-d1b42bcc9837 new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/8a537e03-3281-42de-a4fc-d1b42bcc9837 @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/8aa6c458-a208-43d3-a4b0-763351ab0db4 b/docstore/8aa6c458-a208-43d3-a4b0-763351ab0db4 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/8aa6c458-a208-43d3-a4b0-763351ab0db4 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." 
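The Veo generation loop above polls until the long-running operation completes; a sketch of the remaining step, downloading and saving each generated video (assuming the client and operation variables from that snippet), is:

```python
# Once operation.done is True, download and save each generated video.
for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video{n}.mp4")  # writes the video file to disk
```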
} ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/8acef599-a44f-4e04-aa4c-463d5371dab5 b/docstore/8acef599-a44f-4e04-aa4c-463d5371dab5 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/8acef599-a44f-4e04-aa4c-463d5371dab5 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/8adc2e61-f023-40fd-917d-4e294e863197 b/docstore/8adc2e61-f023-40fd-917d-4e294e863197 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/8adc2e61-f023-40fd-917d-4e294e863197 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
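The Gemini 2.0 Flash Live card above pairs with the Live API for low-latency, bidirectional sessions. A minimal text-only session sketch, assuming the google-genai Python SDK exposes the async live client (client.aio.live.connect) together with the send_client_content and receive calls shown elsewhere in these pages; treat the exact connect signature as an assumption and check it against your SDK version.

import asyncio
from google import genai

client = genai.Client()

async def main():
    # Text in, text out; one response modality per session.
    config = {"response_modalities": ["TEXT"]}
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns=[{"role": "user", "parts": [{"text": "Hello, Live API"}]}],
            turn_complete=True,
        )
        # Stream back the model's turn as it arrives.
        async for message in session.receive():
            if message.text:
                print(message.text)

asyncio.run(main())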
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/8ae9208c-274d-49f3-a152-238660d44f04 b/docstore/8ae9208c-274d-49f3-a152-238660d44f04 new file mode 100644 index 0000000000000000000000000000000000000000..2d31d5628bca7cb00e5e08ad87458576b80881c1 --- /dev/null +++ b/docstore/8ae9208c-274d-49f3-a152-238660d44f04 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#object-detection Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8b29254d-242d-489f-916b-0fff2ab81146 b/docstore/8b29254d-242d-489f-916b-0fff2ab81146 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/8b29254d-242d-489f-916b-0fff2ab81146 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/8b3170ec-37a1-4479-a0d0-ef968ccefa56 b/docstore/8b3170ec-37a1-4479-a0d0-ef968ccefa56 new file mode 100644 index 0000000000000000000000000000000000000000..03e11f1b3b2cd84e15c6098d543ad30ece4e0a72 --- /dev/null +++ b/docstore/8b3170ec-37a1-4479-a0d0-ef968ccefa56 @@ -0,0 +1 @@ +Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . 
embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that \ No newline at end of file diff --git a/docstore/8b4b14ac-c03f-40ca-9522-9b4d92bd3703 b/docstore/8b4b14ac-c03f-40ca-9522-9b4d92bd3703 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/8b4b14ac-c03f-40ca-9522-9b4d92bd3703 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
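The embeddings example above returns a raw vector from the OpenAI compatibility endpoint; a common follow-up is comparing two strings by cosine similarity. A minimal sketch reusing the same client.embeddings.create call and text-embedding-004 model from the snippet above (the cosine helper is illustrative and not part of the API):

import math
from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

def embed(text: str) -> list[float]:
    # Same call as the embeddings example above.
    response = client.embeddings.create(input=text, model="text-embedding-004")
    return response.data[0].embedding

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    return dot / (norm_a * norm_b)

print(cosine(embed("How do I bake bread?"), embed("Bread baking instructions")))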
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/8b6135bb-ecef-4562-9e44-823102294f67 b/docstore/8b6135bb-ecef-4562-9e44-823102294f67 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/8b6135bb-ecef-4562-9e44-823102294f67 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/8b62f76d-58b4-4137-82d1-b597221555d7 b/docstore/8b62f76d-58b4-4137-82d1-b597221555d7 new file mode 100644 index 0000000000000000000000000000000000000000..3e0dca132b5cee05a4316835e4f2b62d82d3c7fe --- /dev/null +++ b/docstore/8b62f76d-58b4-4137-82d1-b597221555d7 @@ -0,0 +1 @@ +Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/8b7b1007-639c-4aa2-89f1-4c8678ccdae5 b/docstore/8b7b1007-639c-4aa2-89f1-4c8678ccdae5 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/8b7b1007-639c-4aa2-89f1-4c8678ccdae5 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/8b8aeb90-2af7-488d-857f-1cbe75728fdc b/docstore/8b8aeb90-2af7-488d-857f-1cbe75728fdc new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/8b8aeb90-2af7-488d-857f-1cbe75728fdc @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
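The version-name patterns described above (latest stable, pinned stable, preview, experimental) all plug into the same model argument. A minimal sketch with the google-genai Python client, using the example names from the text; pinning a specific stable version is the recommendation for production code so behavior does not change underneath you.

from google import genai

client = genai.Client()

# Latest stable for the generation and variation; may move as new stables ship.
latest = client.models.generate_content(
    model="gemini-2.0-flash", contents="Ping"
)

# Pinned stable version; usually doesn't change, so prefer it for production.
pinned = client.models.generate_content(
    model="gemini-2.0-flash-001", contents="Ping"
)

print(latest.text)
print(pinned.text)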
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/8b8dd97c-b706-4331-ae0d-2c529cce9885 b/docstore/8b8dd97c-b706-4331-ae0d-2c529cce9885 new file mode 100644 index 0000000000000000000000000000000000000000..d276bcbbbbd8ffd587f83aebc6d230d4e5c5b078 --- /dev/null +++ b/docstore/8b8dd97c-b706-4331-ae0d-2c529cce9885 @@ -0,0 +1 @@ +Gemini Developer API Pricing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Pricing The Gemini API "free tier" is offered through the API service with lower rate limits for testing purposes. Google AI Studio usage is completely free in all available countries. The Gemini API "paid tier" comes with higher rate limits , additional features, and different data handling. Upgrade to the Paid Tier If you're looking to reduce costs and your use case doesn't require immediate real-time responses, check out Batch Mode . Batch Mode is designed to process large volumes of requests asynchronously. Requests submitted using this mode is 50% of the price of interactive (non-batch mode) requests. Gemini 2.5 Pro Try it in Google AI Studio Our state-of-the-art multipurpose model, which excels at coding and complex reasoning tasks. 
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 200k tokens $2.50, prompts > 200k tokens Output price (including thinking tokens) Free of charge $10.00, prompts <= 200k tokens $15.00, prompts > 200k Context caching price Not available $0.31, prompts <= 200k tokens $0.625, prompts > 200k $4.50 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Not available 1,500 RPD (free), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Try it in Google AI Studio Our first hybrid reasoning model which supports a 1M token context window and has thinking budgets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.30 (text / image / video) $1.00 (audio) Output price (including thinking tokens) Free of charge $2.50 Context \ No newline at end of file diff --git a/docstore/8b92e934-3d75-4e53-ac2a-b3696e99f366 b/docstore/8b92e934-3d75-4e53-ac2a-b3696e99f366 new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/8b92e934-3d75-4e53-ac2a-b3696e99f366 @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/8bc81bec-a897-4eb1-bd87-f20a25b15bf0 b/docstore/8bc81bec-a897-4eb1-bd87-f20a25b15bf0 new file mode 100644 index 0000000000000000000000000000000000000000..122c682d2774097387ed4735af082d43f98d76f5 --- /dev/null +++ b/docstore/8bc81bec-a897-4eb1-bd87-f20a25b15bf0 @@ -0,0 +1 @@ +regions. Can I use 1M tokens in the free tier? 
The free tier for Gemini API differs based on the model selected. For now, you can try the 1M token context window in the following ways: In Google AI Studio With pay-as-you-go plans With free-of-charge plans for select models See the latest free-of-charge rate limits per model on rate limits page . How can I calculate the number of tokens I'm using? Use the GenerativeModel.count_tokens method to count the number of tokens. Refer to the Tokens guide to learn more about tokens. Can I use my Google Cloud credits with the Gemini API? Yes, Google Cloud credits can be used towards Gemini API usage. How is billing handled? Billing for the Gemini API is handled by the Cloud Billing system. Am I charged for failed requests? If your request fails with a 400 or 500 error, you won't be charged for the tokens used. However, the request will still count against your quota. Is there a charge for fine-tuning the models? Model tuning is free, but inference on tuned models is charged at the same rate as the base models. Is GetTokens billed? Requests to the GetTokens API are not billed, and they don't count against inference quota. How is my Google AI Studio data handled if I have a paid API account? Refer to the terms for details on how data is handled when Cloud billing is enabled (see "How Google Uses Your Data" under "Paid Services"). Note that your Google AI Studio prompts are treated under the same "Paid Services" terms so long as at least 1 API project has billing enabled, which you can validate on the Gemini API Key page if you see any projects marked as "Paid" under "Plan". Where can I get help with billing? To get help with billing, see Get Cloud Billing support . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered \ No newline at end of file diff --git a/docstore/8bcc08a9-a3b2-4c2c-838e-98d03bbcf729 b/docstore/8bcc08a9-a3b2-4c2c-838e-98d03bbcf729 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/8bcc08a9-a3b2-4c2c-838e-98d03bbcf729 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/8bdfc919-320d-43b6-bb03-d813dab815d4 b/docstore/8bdfc919-320d-43b6-bb03-d813dab815d4 new file mode 100644 index 0000000000000000000000000000000000000000..4b5f15989e784aa4b4f5462e86ee08ece0d0f480 --- /dev/null +++ b/docstore/8bdfc919-320d-43b6-bb03-d813dab815d4 @@ -0,0 +1 @@ +pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = client . files . upload ( file = 'a11.txt' ) response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Can you summarize this file:' , my_file ] ) print ( response . text ) List and get List uploaded files and get an uploaded file with a filename: Before Python import google.generativeai as genai for file in genai . list_files (): print ( file . name ) file = genai . get_file ( name = file . name ) After Python from google import genai client = genai . Client () for file in client . files . list (): print ( file . name ) file = client . files . get ( name = file . name ) Delete Delete a file: Before Python import pathlib import google.generativeai as genai pathlib . 
Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = genai . upload_file ( path = 'dummy.txt' ) file = genai . delete_file ( name = dummy_file . name ) After Python import pathlib from google import genai client = genai . Client () pathlib . Path ( 'dummy.txt' ) . write_text ( dummy ) dummy_file = client . files . upload ( file = 'dummy.txt' ) response = client . files . delete ( name = dummy_file . name ) Context caching Context caching allows the user to pass the content to the model once, cache the input tokens, and then refer to the cached tokens in subsequent calls to lower the cost. Before Python import requests import pathlib import google.generativeai as genai from google.generativeai import caching # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = genai . upload_file ( path = "a11.txt" ) # Create cache apollo_cache = caching . CachedContent . create ( model = "gemini-1.5-flash-001" , system_instruction = "You are an expert at analyzing transcripts." , contents = [ document ], ) # Generate response apollo_model = genai . GenerativeModel . \ No newline at end of file diff --git a/docstore/8be354c3-0c2c-481e-a775-6aea48e6b544 b/docstore/8be354c3-0c2c-481e-a775-6aea48e6b544 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/8be354c3-0c2c-481e-a775-6aea48e6b544 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/8be3ce9d-ab8c-4900-a9b0-8f0dd52e19d7 b/docstore/8be3ce9d-ab8c-4900-a9b0-8f0dd52e19d7 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/8be3ce9d-ab8c-4900-a9b0-8f0dd52e19d7 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/8be57229-423d-4d36-aebd-68ad8bd7659c b/docstore/8be57229-423d-4d36-aebd-68ad8bd7659c new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/8be57229-423d-4d36-aebd-68ad8bd7659c @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. 
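The Gemini 2.5 Pro card above lists a 1,048,576-token input limit and a 65,536-token output limit; these can also be read programmatically rather than hard-coded. A sketch assuming the google-genai client exposes a models.get lookup with snake_cased limit fields, as in recent releases; verify the field names against your SDK version.

from google import genai

client = genai.Client()

# Assumes client.models.get plus input_token_limit / output_token_limit fields.
info = client.models.get(model="gemini-2.5-pro")
print(info.input_token_limit, info.output_token_limit)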
Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/8be6e001-68aa-437d-96b9-b617c68b88c4 b/docstore/8be6e001-68aa-437d-96b9-b617c68b88c4 new file mode 100644 index 0000000000000000000000000000000000000000..eb233fee8099f5f789dde3693dc446d13c990aff --- /dev/null +++ b/docstore/8be6e001-68aa-437d-96b9-b617c68b88c4 @@ -0,0 +1 @@ +UploadFile ( ... ) After (Centralized Client Object) Python from google import genai # Create a single client object client = genai . Client () # Access API methods through services on the client object response = client . models . generate_content ( ... ) chat = client . chats . create ( ... ) my_file = client . files . upload ( ... ) tuning_job = client . tunings . tune ( ... ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Create a single client object const ai = new GoogleGenAI ({ apiKey : "YOUR_API_KEY" }); // Access API methods through services on the client object const response = await ai . models . generateContent (...); const chat = ai . chats . create (...); const uploadedFile = await ai . files . upload (...); const cache = await ai . caches . create (...); Go import "google.golang.org/genai" // Create a single client object client , err := genai . NewClient ( ctx , nil ) // Access API methods through services on the client object result , err := client . Models . GenerateContent ( ... ) chat , err := client . Chats . Create ( ... ) uploadedFile , err := client . Files . Upload ( ... ) tuningJob , err := client . Tunings . Tune ( ... ) Authentication Both legacy and new libraries authenticate using API keys. You can create your API key in Google AI Studio. Before Python The old SDK handled the API client object implicitly. import google.generativeai as genai genai . configure ( api_key =... ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); Go Import the Google libraries: import ( "github.com/google/generative-ai-go/genai" "google.golang.org/api/option" ) Create the client: client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) After Python With Google GenAI SDK, you create an API client first, which is used to call the API. 
The new SDK will pick up your API key from either one of the GEMINI_API_KEY or GOOGLE_API_KEY environment \ No newline at end of file diff --git a/docstore/8becf36d-97ba-44ac-a86a-8e38bc70eed4 b/docstore/8becf36d-97ba-44ac-a86a-8e38bc70eed4 new file mode 100644 index 0000000000000000000000000000000000000000..fc2e63c0fbe73768c251346fd94b01e59289b693 --- /dev/null +++ b/docstore/8becf36d-97ba-44ac-a86a-8e38bc70eed4 @@ -0,0 +1 @@ +Additional usage policies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Additional usage policies This page includes additional usage policies for the Gemini API. Abuse monitoring Google is committed to the responsible development and use of AI. To ensure the safety and integrity of the Gemini API, we have created these policy guidelines. By using the Gemini API, you agree to the following guidelines, the Gemini API Additional Terms of Service and Generative AI Prohibited Use Policy . How We Monitor for Misuse Google's Trust and Safety Team employs a combination of automated and manual processes to detect potential misuse of the Gemini API and enforce our policies. Automated Detection: Automated systems scan API usage for violations of our Prohibited Use Policy, such as hate speech, harassment, sexually explicit content, and dangerous content. Manual Detection: If a project consistently exhibits suspicious activity, it may be flagged for manual review by authorized Google personnel. How We Handle Data To help with abuse monitoring, Google retains the following data for fifty-five (55) days: Prompts: The text prompts you submit to the API. Contextual Information: Any additional context you provide with your prompts. Output: The responses generated by the Gemini API. How We Investigate Potential Issues When prompts or model outputs are flagged by safety filters and abuse detection systems described above, authorized Google employees may assess the flagged content, and either confirm or correct the classification or determination based on predefined guidelines and policies. Data can be accessed for human review only by authorized \ No newline at end of file diff --git a/docstore/8bf55844-2d0f-4dc9-a78d-0d1128ff75f9 b/docstore/8bf55844-2d0f-4dc9-a78d-0d1128ff75f9 new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/8bf55844-2d0f-4dc9-a78d-0d1128ff75f9 @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . 
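The migration note above says the new SDK picks up your API key from the GEMINI_API_KEY or GOOGLE_API_KEY environment variable when the client is constructed without arguments; you can also pass the key explicitly. A minimal sketch (the explicit api_key keyword is assumed to match current google-genai releases):

import os
from google import genai

# Option 1: rely on the GEMINI_API_KEY / GOOGLE_API_KEY environment variable.
client = genai.Client()

# Option 2: pass the key explicitly (assumed keyword; check your SDK version).
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Hello"
)
print(response.text)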
Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries. Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/8c343c3e-6ff2-4291-9e1f-1c715c4226e7 b/docstore/8c343c3e-6ff2-4291-9e1f-1c715c4226e7 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/8c343c3e-6ff2-4291-9e1f-1c715c4226e7 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. 
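The function-calling text above says declarations are defined with a subset of the OpenAPI schema and passed to the model in a tools object. A minimal sketch of a single declaration for the set_light_values function used in the step-by-step example, assuming the types.Tool / GenerateContentConfig wiring of the google-genai Python SDK; the enum values and descriptions here are illustrative.

from google import genai
from google.genai import types

client = genai.Client()

# One declaration: name, description, and OpenAPI-style parameters.
set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {
                "type": "integer",
                "description": "Light level from 0 to 100.",
            },
            "color_temp": {
                "type": "string",
                "enum": ["daylight", "cool", "warm"],  # illustrative values
            },
        },
        "required": ["brightness", "color_temp"],
    },
}

tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Dim the lights to a warm 30%",
    config=config,
)
# The model should respond with a function call instead of plain text.
print(response.candidates[0].content.parts[0].function_call)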
A single function \ No newline at end of file diff --git a/docstore/8c5a6e56-611c-4492-ad61-9660bac00576 b/docstore/8c5a6e56-611c-4492-ad61-9660bac00576 new file mode 100644 index 0000000000000000000000000000000000000000..155726b0c29beaa4b366bda3c7400e5b8cd47772 --- /dev/null +++ b/docstore/8c5a6e56-611c-4492-ad61-9660bac00576 @@ -0,0 +1 @@ +context that you want to re-use many times, context caching can help reduce the costs associated with asking questions about that information. Does the context length affect the model latency? There is some fixed amount of latency in any given request, regardless of the size, but generally longer queries will have higher latency (time to first token). Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-05-20 UTC. \ No newline at end of file diff --git a/docstore/8c5c9448-94db-40bb-93b1-430b3db27606 b/docstore/8c5c9448-94db-40bb-93b1-430b3db27606 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/8c5c9448-94db-40bb-93b1-430b3db27606 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . 
sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/8c5f778e-1bf8-47ed-a049-cb76c7adcf36 b/docstore/8c5f778e-1bf8-47ed-a049-cb76c7adcf36 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/8c5f778e-1bf8-47ed-a049-cb76c7adcf36 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/8c62fe0d-712d-4666-b45d-30801f09ec9e b/docstore/8c62fe0d-712d-4666-b45d-30801f09ec9e new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/8c62fe0d-712d-4666-b45d-30801f09ec9e @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/8c92dea1-b3a9-40f6-b66f-bce4a197d90c b/docstore/8c92dea1-b3a9-40f6-b66f-bce4a197d90c new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/8c92dea1-b3a9-40f6-b66f-bce4a197d90c @@ -0,0 +1 @@ +patterns for more details. 
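The REST and Apps Script snippets above set stopSequences, topP, and topK alongside temperature; the same overrides map to snake_cased fields on GenerateContentConfig in the Python SDK. A minimal sketch combining them (max_output_tokens is added for illustration and is not in the snippets above):

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        temperature=1.0,
        top_p=0.8,
        top_k=10,
        stop_sequences=["Title"],   # mirrors the REST example above
        max_output_tokens=256,      # illustrative addition
    ),
)
print(response.text)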
Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/8cbc8688-a4e9-47ce-8db8-99ad7fd79e92 b/docstore/8cbc8688-a4e9-47ce-8db8-99ad7fd79e92 new file mode 100644 index 0000000000000000000000000000000000000000..8aea93175d1ec241651e6c6712461e639d59a270 --- /dev/null +++ b/docstore/8cbc8688-a4e9-47ce-8db8-99ad7fd79e92 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#main-content Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8cc3194b-1bef-4097-bd79-329b9f579991 b/docstore/8cc3194b-1bef-4097-bd79-329b9f579991 new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/8cc3194b-1bef-4097-bd79-329b9f579991 @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . 
debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/8cf3980c-e2ba-44d6-9c97-9a20276a3be6 b/docstore/8cf3980c-e2ba-44d6-9c97-9a20276a3be6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/8cf3980c-e2ba-44d6-9c97-9a20276a3be6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/8d18bd4b-e197-49b9-b51a-0c64343a83b1 b/docstore/8d18bd4b-e197-49b9-b51a-0c64343a83b1 new file mode 100644 index 0000000000000000000000000000000000000000..eb6db224edbdd160f04cb946308fd82587e98eec --- /dev/null +++ b/docstore/8d18bd4b-e197-49b9-b51a-0c64343a83b1 @@ -0,0 +1 @@ +marks Spain's record-breaking fourth European Championship title.[5]((https:/...), [2](https:/...), [3](https:/...), [4](https:/...) Pricing When you use Grounding with Google Search, your project is billed per API request that includes the google_search tool. If the model decides to execute multiple search queries to answer a single prompt (for example, searching for "UEFA Euro 2024 winner" and "Spain vs England Euro 2024 final score" within the same API call), this counts as a single billable use of the tool for that request. For detailed pricing information, see the Gemini API pricing page . Supported Models Experimental and Preview models are not included. You can find their capabilities on the model overview page. 
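The pricing note above bills per API request that includes the google_search tool, and the table below lists which models support it. A minimal grounded-request sketch, assuming the types.GoogleSearch / types.Tool wiring of current google-genai releases; the legacy google_search_retrieval variant for Gemini 1.5 models follows later on this page.

from google import genai
from google.genai import types

client = genai.Client()

# Attach the google_search tool; the model decides when to issue queries, and
# multiple queries within one request still count as a single billable tool use.
grounding_tool = types.Tool(google_search=types.GoogleSearch())
config = types.GenerateContentConfig(tools=[grounding_tool])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Who won UEFA Euro 2024?",
    config=config,
)
print(response.text)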
Model Grounding with Google Search Gemini 2.5 Pro ✔️ Gemini 2.5 Flash ✔️ Gemini 2.0 Flash ✔️ Gemini 1.5 Pro ✔️ Gemini 1.5 Flash ✔️ Note: Older models use a google_search_retrieval tool. For all current models, use the google_search tool as shown in the examples. Grounding with Gemini 1.5 Models (Legacy) While the google_search tool is recommended for Gemini 2.0 and later, Gemini 1.5 support a legacy tool named google_search_retrieval . This tool provides a dynamic mode that allows the model to decide whether to perform a search based on its confidence that the prompt requires fresh information. If the model's confidence is above a dynamic_threshold you set (a value between 0.0 and 1.0), it will perform a search. Python # Note: This is a legacy approach for Gemini 1.5 models. # The 'google_search' tool is recommended for all new development. import os from google import genai from google.genai import types client = genai . Client () retrieval_tool = types . Tool ( google_search_retrieval = types . GoogleSearchRetrieval ( dynamic_retrieval_config = types . DynamicRetrievalConfig ( mode = types . DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamic_threshold = 0.7 # Only search if confidence > 70% ) ) ) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/8d3daa23-ac80-4be0-95d5-02badbae20d3 b/docstore/8d3daa23-ac80-4be0-95d5-02badbae20d3 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/8d3daa23-ac80-4be0-95d5-02badbae20d3 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/8d6e7d14-80f5-4c91-b550-d9551acefa89 b/docstore/8d6e7d14-80f5-4c91-b550-d9551acefa89 new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/8d6e7d14-80f5-4c91-b550-d9551acefa89 @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . 
text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/8d7d1656-488d-439e-bc8e-83f04a98e681 b/docstore/8d7d1656-488d-439e-bc8e-83f04a98e681 new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/8d7d1656-488d-439e-bc8e-83f04a98e681 @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/8d90470d-a5e1-4e7d-8f53-6b668f364607 b/docstore/8d90470d-a5e1-4e7d-8f53-6b668f364607 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/8d90470d-a5e1-4e7d-8f53-6b668f364607 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... 
You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/8d9b3548-6927-4748-9323-caef88535c68 b/docstore/8d9b3548-6927-4748-9323-caef88535c68 new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/8d9b3548-6927-4748-9323-caef88535c68 @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. 
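The keyword tables below only change the prompt text; the request itself stays the same. As a hedged sketch of sending such a prompt with the Python SDK (the model name is the one quoted in the tables that follow, and the output filename is illustrative):

Python

# Minimal sketch: photorealism keywords are simply part of the prompt string.
from google import genai
from google.genai import types

client = genai.Client()  # reads GEMINI_API_KEY from the environment

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A woman, 35mm portrait, film noir, black and white film",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Save the first generated image (filename is illustrative).
with open("portrait.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)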
Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/8db8fc35-2116-47bf-aa99-148bf673238c b/docstore/8db8fc35-2116-47bf-aa99-148bf673238c new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/8db8fc35-2116-47bf-aa99-148bf673238c @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/8ddaa7cf-6a5d-470f-83fd-82133d236ad7 b/docstore/8ddaa7cf-6a5d-470f-83fd-82133d236ad7 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/8ddaa7cf-6a5d-470f-83fd-82133d236ad7 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . 
Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/8de01de6-979e-422b-a27e-e242a6450d3b b/docstore/8de01de6-979e-422b-a27e-e242a6450d3b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/8de01de6-979e-422b-a27e-e242a6450d3b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/8e0ff6d2-dec8-4a5c-b57d-b6c91caf474a b/docstore/8e0ff6d2-dec8-4a5c-b57d-b6c91caf474a new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/8e0ff6d2-dec8-4a5c-b57d-b6c91caf474a @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 
1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/8e13aab4-ace0-4324-b4a2-49e7d49dbf18 b/docstore/8e13aab4-ace0-4324-b4a2-49e7d49dbf18 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/8e13aab4-ace0-4324-b4a2-49e7d49dbf18 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/8e4957f2-dacb-4df9-9772-7a46464429c0 b/docstore/8e4957f2-dacb-4df9-9772-7a46464429c0 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/8e4957f2-dacb-4df9-9772-7a46464429c0 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/8e5e0034-5b51-4ebc-beb4-5be0fcc0c975 b/docstore/8e5e0034-5b51-4ebc-beb4-5be0fcc0c975 new file mode 100644 index 0000000000000000000000000000000000000000..3d32a6c6f44782138d2600dc9a5e7c5bf75a9a24 --- /dev/null +++ b/docstore/8e5e0034-5b51-4ebc-beb4-5be0fcc0c975 @@ -0,0 +1 @@ +in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing \ No newline at end of file diff --git a/docstore/8e744312-5cd0-4c66-9434-bdedf451689d b/docstore/8e744312-5cd0-4c66-9434-bdedf451689d new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/8e744312-5cd0-4c66-9434-bdedf451689d @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. 
Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/8e7dce2a-6919-4c04-9e81-2364c066ecc8 b/docstore/8e7dce2a-6919-4c04-9e81-2364c066ecc8 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/8e7dce2a-6919-4c04-9e81-2364c066ecc8 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/8e8c573a-82b0-4038-88bb-1692dab9f12c b/docstore/8e8c573a-82b0-4038-88bb-1692dab9f12c new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/8e8c573a-82b0-4038-88bb-1692dab9f12c @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/8e91b339-d589-41c9-bf03-fa5de2b125a4 b/docstore/8e91b339-d589-41c9-bf03-fa5de2b125a4 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/8e91b339-d589-41c9-bf03-fa5de2b125a4 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . 
log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the action requested by the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/8e96dace-4d15-47ae-91cd-69b9c6dd9589 b/docstore/8e96dace-4d15-47ae-91cd-69b9c6dd9589 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/8e96dace-4d15-47ae-91cd-69b9c6dd9589 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so?
Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/8ea7b34a-69da-4ef9-aee3-0a71251eaccf b/docstore/8ea7b34a-69da-4ef9-aee3-0a71251eaccf new file mode 100644 index 0000000000000000000000000000000000000000..7ad07eb45fff1ffd88928a8c1191c40c43412859 --- /dev/null +++ b/docstore/8ea7b34a-69da-4ef9-aee3-0a71251eaccf @@ -0,0 +1 @@ +public domain and does not show identifiable people. ( NASA image and media usage guidelines. ) The following code downloads the sample video, uploads it using the File API, waits for it to be processed, and then uses the file reference in a generateContent request. Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp4" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file \ No newline at end of file diff --git a/docstore/8ecb79fd-092d-41cb-a822-bb02f9883d62 b/docstore/8ecb79fd-092d-41cb-a822-bb02f9883d62 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/8ecb79fd-092d-41cb-a822-bb02f9883d62 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . 
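Per-model token limits and related metadata can also be read programmatically rather than hard-coded. A hedged sketch with the Python SDK, assuming client.models.get() and client.models.list() expose the same fields as the REST models endpoint (field availability can vary by model):

Python

# Sketch: inspect a model's token limits and enumerate the models visible to your key.
from google import genai

client = genai.Client()

info = client.models.get(model="gemini-2.5-flash")
print(info.name, info.input_token_limit, info.output_token_limit)

for m in client.models.list():
    print(m.name)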
Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/8eccfeff-f7b3-4c9d-800e-ea122eb6c755 b/docstore/8eccfeff-f7b3-4c9d-800e-ea122eb6c755 new file mode 100644 index 0000000000000000000000000000000000000000..8a34a1fe66a041005f53a5e081e09b0fa5f13242 --- /dev/null +++ b/docstore/8eccfeff-f7b3-4c9d-800e-ea122eb6c755 @@ -0,0 +1 @@ +Grounding with Google Search | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Grounding with Google Search Grounding with Google Search connects the Gemini model to real-time web content and works with all available languages . This allows Gemini to provide more accurate answers and cite verifiable sources beyond its knowledge cutoff. Grounding helps you build applications that can: Increase factual accuracy: Reduce model hallucinations by basing responses on real-world information. Access real-time information: Answer questions about recent events and topics. Provide citations: Build user trust by showing the sources for the model's claims. Python from google import genai from google.genai import types # Configure the client client = genai . Client () # Define the grounding tool grounding_tool = types . Tool ( google_search = types . GoogleSearch () ) # Configure generation settings config = types . GenerateContentConfig ( tools = [ grounding_tool ] ) # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "Who won the euro 2024?" , config = config , ) # Print the grounded response print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Define the grounding tool const groundingTool = { googleSearch : {}, }; // Configure generation settings const config = { tools : [ groundingTool ], }; // Make the request const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Who won the euro 2024?" , config , }); // Print the grounded response console . log ( response . \ No newline at end of file diff --git a/docstore/8ed0f5d1-8b81-40b1-b0a6-8e37e4ded488 b/docstore/8ed0f5d1-8b81-40b1-b0a6-8e37e4ded488 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/8ed0f5d1-8b81-40b1-b0a6-8e37e4ded488 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/8edae69b-a254-4ff3-8104-86f8b185af3a b/docstore/8edae69b-a254-4ff3-8104-86f8b185af3a new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/8edae69b-a254-4ff3-8104-86f8b185af3a @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . 
readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/8ee6ab1e-ede4-411f-8e92-af76e16024cc b/docstore/8ee6ab1e-ede4-411f-8e92-af76e16024cc new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/8ee6ab1e-ede4-411f-8e92-af76e16024cc @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. 
""" music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/8ef60e43-35ae-4a6a-aaac-7d7343aba24d b/docstore/8ef60e43-35ae-4a6a-aaac-7d7343aba24d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/8ef60e43-35ae-4a6a-aaac-7d7343aba24d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/8efa2def-e945-4c65-8c00-fbcad6a8e08c b/docstore/8efa2def-e945-4c65-8c00-fbcad6a8e08c new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/8efa2def-e945-4c65-8c00-fbcad6a8e08c @@ -0,0 +1 @@ +to avoid. 
⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/8f0b490c-f367-4030-9c6b-7e445b7652ee b/docstore/8f0b490c-f367-4030-9c6b-7e445b7652ee new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/8f0b490c-f367-4030-9c6b-7e445b7652ee @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/8f2b2755-0e93-4f62-9aaf-a8df4234e17c b/docstore/8f2b2755-0e93-4f62-9aaf-a8df4234e17c new file mode 100644 index 0000000000000000000000000000000000000000..185cd7c14b73dd4804292716b4231cde98556b13 --- /dev/null +++ b/docstore/8f2b2755-0e93-4f62-9aaf-a8df4234e17c @@ -0,0 +1 @@ +Safety settings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Safety settings The Gemini API provides safety settings that you can adjust during the prototyping stage to determine if your application requires more or less restrictive safety configuration. You can adjust these settings across five filter categories to restrict or allow certain types of content. This guide covers how the Gemini API handles safety settings and filtering and how you can change the safety settings for your application. Note: Applications that use less restrictive safety settings may be subject to review. See the Terms of Service for more information. Safety filters The Gemini API's adjustable safety filters cover the following categories: Category Description Harassment Negative or harmful comments targeting identity and/or protected attributes. Hate speech Content that is rude, disrespectful, or profane. Sexually explicit Contains references to sexual acts or other lewd content. Dangerous Promotes, facilitates, or encourages harmful acts. Civic integrity Election-related queries. These categories are defined in HarmCategory . The Gemini models only support HARM_CATEGORY_HARASSMENT , HARM_CATEGORY_HATE_SPEECH , HARM_CATEGORY_SEXUALLY_EXPLICIT , HARM_CATEGORY_DANGEROUS_CONTENT , and HARM_CATEGORY_CIVIC_INTEGRITY . All other categories are used only by PaLM 2 (Legacy) models. You can use these filters to adjust what's appropriate for your use case. 
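As a minimal sketch of what adjusting one filter looks like with the Python SDK (category and threshold values come from the HarmCategory and HarmBlockThreshold enums; the prompt and the chosen threshold here are illustrative):

Python

# Sketch: relax the Dangerous category to block only high-probability content
# for a single request, leaving the other categories at their defaults.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write one line of trash talk for a fighting-game villain.",
    config=types.GenerateContentConfig(
        safety_settings=[
            types.SafetySetting(
                category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
            ),
        ]
    ),
)
print(response.text)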
For example, if you're building video game dialogue, you may deem it acceptable to allow more content that's rated as Dangerous due to the nature of the game. In addition to the adjustable safety filters, the \ No newline at end of file diff --git a/docstore/8f4d9fa6-51c5-4623-b2f5-a6cbfdd3430f b/docstore/8f4d9fa6-51c5-4623-b2f5-a6cbfdd3430f new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/8f4d9fa6-51c5-4623-b2f5-a6cbfdd3430f @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/8f52efd9-8c06-412e-8263-36e9cef89a55 b/docstore/8f52efd9-8c06-412e-8263-36e9cef89a55 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/8f52efd9-8c06-412e-8263-36e9cef89a55 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . 
generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/8f53d7d0-72d0-4db6-ae53-9b2c1726b467 b/docstore/8f53d7d0-72d0-4db6-ae53-9b2c1726b467 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/8f53d7d0-72d0-4db6-ae53-9b2c1726b467 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. 
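Negative prompts can also be passed programmatically when you call Veo through the API. A sketch with the google-genai SDK, assuming GenerateVideosConfig accepts a negative_prompt field (the generate_videos call and polling loop mirror the Veo example later in this document):

```python
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=(
        "A short, stylized animation of a large, solitary oak tree with "
        "leaves blowing vigorously in a strong wind, warm color palette"
    ),
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",
        # Describe what to avoid; don't use instructive words like "no" or "don't".
        negative_prompt="urban background, man-made structures, dark gloomy palette",
    ),
)

# Veo runs as a long-running operation; poll until it completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)
```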
With negative \ No newline at end of file diff --git a/docstore/8f582426-068f-4ba1-8c8b-697e3814fcc7 b/docstore/8f582426-068f-4ba1-8c8b-697e3814fcc7 new file mode 100644 index 0000000000000000000000000000000000000000..8c60a97b59d947e95247d6e4ee3eb21605ab2ae3 --- /dev/null +++ b/docstore/8f582426-068f-4ba1-8c8b-697e3814fcc7 @@ -0,0 +1 @@ +open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of \ No newline at end of file diff --git a/docstore/8f5efe63-ea7a-44e8-bd2e-74185b555cdf b/docstore/8f5efe63-ea7a-44e8-bd2e-74185b555cdf new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/8f5efe63-ea7a-44e8-bd2e-74185b555cdf @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" 
} ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/8fab6fa5-95a4-4f94-9270-0da107df4466 b/docstore/8fab6fa5-95a4-4f94-9270-0da107df4466 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/8fab6fa5-95a4-4f94-9270-0da107df4466 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
\ No newline at end of file diff --git a/docstore/8faee5a0-7dc7-4247-b493-ad4f50efaecd b/docstore/8faee5a0-7dc7-4247-b493-ad4f50efaecd new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/8faee5a0-7dc7-4247-b493-ad4f50efaecd @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/8fb16155-ef1e-4906-8ad9-30d7f885d57d b/docstore/8fb16155-ef1e-4906-8ad9-30d7f885d57d new file mode 100644 index 0000000000000000000000000000000000000000..d718514486d8c4a5867b0240d223bf4fd2539a2c --- /dev/null +++ b/docstore/8fb16155-ef1e-4906-8ad9-30d7f885d57d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/audio#inline-audio Title: Audio understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/8fb4cdce-5a60-4b26-a4dc-5a5f2673c099 b/docstore/8fb4cdce-5a60-4b26-a4dc-5a5f2673c099 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/8fb4cdce-5a60-4b26-a4dc-5a5f2673c099 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
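As a quick sketch of sending a request to this model with the google-genai SDK (generate_content and the gemini-2.5-pro model ID appear elsewhere in these docs; the prompt itself is just an illustration):

```python
from google import genai

client = genai.Client()

# 2.5 Pro is the reasoning-heavy option; swap in "gemini-2.5-flash" when
# latency and cost matter more than maximum reasoning depth.
response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents=(
        "Review this function and explain its time and space complexity:\n"
        "def dedupe(xs): return list(dict.fromkeys(xs))"
    ),
)
print(response.text)
```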
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/8feadb0d-3f89-4d39-a212-ecf0dcc2f17f b/docstore/8feadb0d-3f89-4d39-a212-ecf0dcc2f17f new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/8feadb0d-3f89-4d39-a212-ecf0dcc2f17f @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. This lets to capture more of the scene vertically compared to the 1:1 aspect ratio. 
Prompt: a woman hiking, close up of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. This is a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/903036cb-f546-4233-955f-a477146dcfe2 b/docstore/903036cb-f546-4233-955f-a477146dcfe2 new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/903036cb-f546-4233-955f-a477146dcfe2 @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend always including few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/9052577b-8264-4eb4-86b1-188254d6c2ef b/docstore/9052577b-8264-4eb4-86b1-188254d6c2ef new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/9052577b-8264-4eb4-86b1-188254d6c2ef @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions.
Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing like walking, running, or turning their head. A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of a an eye with city reflected in it. Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic. Ambiance This example \ No newline at end of file diff --git a/docstore/90677f19-004e-4015-ac91-84368137a1bd b/docstore/90677f19-004e-4015-ac91-84368137a1bd new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/90677f19-004e-4015-ac91-84368137a1bd @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . 
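Since thinking is on by default for these models, here is a minimal sketch of constraining it, assuming the google-genai SDK's ThinkingConfig and thinking_budget fields (a budget of 0 is intended to effectively switch thinking off on 2.5 Flash):

```python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="In two sentences, when should I pick 2.5 Flash over 2.5 Pro?",
    config=types.GenerateContentConfig(
        # Cap the thinking budget; use 0 to effectively turn thinking off.
        thinking_config=types.ThinkingConfig(thinking_budget=0),
    ),
)
print(response.text)
```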
If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend reviewing the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/906a44bf-2eb0-4f14-b630-23dab247f0fc b/docstore/906a44bf-2eb0-4f14-b630-23dab247f0fc new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/906a44bf-2eb0-4f14-b630-23dab247f0fc @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases.
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9074df47-0f60-4e2c-af61-1f36a5ae9e8b b/docstore/9074df47-0f60-4e2c-af61-1f36a5ae9e8b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/9074df47-0f60-4e2c-af61-1f36a5ae9e8b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/909a32a8-519d-4975-9308-98cff2cae4e5 b/docstore/909a32a8-519d-4975-9308-98cff2cae4e5 new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/909a32a8-519d-4975-9308-98cff2cae4e5 @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? 
Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/90f92079-4cac-450d-ace0-264c03732957 b/docstore/90f92079-4cac-450d-ace0-264c03732957 new file mode 100644 index 0000000000000000000000000000000000000000..44d10ad654e1ae877f525afe1fc1f8db1da83e76 --- /dev/null +++ b/docstore/90f92079-4cac-450d-ace0-264c03732957 @@ -0,0 +1 @@ +get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . 
from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the \ No newline at end of file diff --git a/docstore/9103fe5b-79e5-444a-b1bb-e4e639c231a8 b/docstore/9103fe5b-79e5-444a-b1bb-e4e639c231a8 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/9103fe5b-79e5-444a-b1bb-e4e639c231a8 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. 
It's your application's responsibility to \ No newline at end of file diff --git a/docstore/91184901-2a97-43bc-a8de-93dc8148c241 b/docstore/91184901-2a97-43bc-a8de-93dc8148c241 new file mode 100644 index 0000000000000000000000000000000000000000..23e2306067a8d9fd8cd77a2d432409752618662f --- /dev/null +++ b/docstore/91184901-2a97-43bc-a8de-93dc8148c241 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#available-models Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9125c896-cd49-43f5-827d-d36f9d89c407 b/docstore/9125c896-cd49-43f5-827d-d36f9d89c407 new file mode 100644 index 0000000000000000000000000000000000000000..10a18346aa885d4b8d8c810e47dfc2e98890f23d --- /dev/null +++ b/docstore/9125c896-cd49-43f5-827d-d36f9d89c407 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/files#specific-instructions Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/91390790-f592-4422-8589-89bfbeb49e50 b/docstore/91390790-f592-4422-8589-89bfbeb49e50 new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/91390790-f592-4422-8589-89bfbeb49e50 @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/913abb3c-439f-4dc6-b3b9-769f3bae72c9 b/docstore/913abb3c-439f-4dc6-b3b9-769f3bae72c9 new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/913abb3c-439f-4dc6-b3b9-769f3bae72c9 @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . 
live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/915fe27c-93a1-4af0-bada-22dee6474c9e b/docstore/915fe27c-93a1-4af0-bada-22dee6474c9e new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/915fe27c-93a1-4af0-bada-22dee6474c9e @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . 
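As a small sketch of the client setup that the following examples assume (recent google-genai versions can read the GEMINI_API_KEY environment variable used by the REST examples in this document, or you can pass the key explicitly):

```python
import os
from google import genai

# Option 1: rely on the environment, as the REST examples in this guide do:
#   export GEMINI_API_KEY="..."
client = genai.Client()

# Option 2: pass the key explicitly (placeholder shown; avoid hard-coding real keys).
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
```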
Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/917ee816-7fce-4faf-99b9-576aea161291 b/docstore/917ee816-7fce-4faf-99b9-576aea161291 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/917ee816-7fce-4faf-99b9-576aea161291 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/9189cbba-8ccc-4562-96ad-fc5d0e81ab58 b/docstore/9189cbba-8ccc-4562-96ad-fc5d0e81ab58 new file mode 100644 index 0000000000000000000000000000000000000000..05313682a3c7d368ba238c3255ba9cc1f0488136 --- /dev/null +++ b/docstore/9189cbba-8ccc-4562-96ad-fc5d0e81ab58 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9198c6ad-c752-4481-b8dc-ea66c8a7c2d5 b/docstore/9198c6ad-c752-4481-b8dc-ea66c8a7c2d5 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/9198c6ad-c752-4481-b8dc-ea66c8a7c2d5 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/91d1214c-0103-4e2d-8dfe-082ebd21ebf6 b/docstore/91d1214c-0103-4e2d-8dfe-082ebd21ebf6 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/91d1214c-0103-4e2d-8dfe-082ebd21ebf6 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. 
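To check that rule of thumb against a concrete string, the SDK exposes a token-counting call. A short sketch with the google-genai Python SDK (count_tokens usage as assumed here; the model ID is one of the stable IDs described below):

```python
from google import genai

client = genai.Client()

# Roughly 4 characters per token for Gemini models; verify with count_tokens.
text = "The quick brown fox jumps over the lazy dog. " * 10
result = client.models.count_tokens(
    model="gemini-2.0-flash",
    contents=text,
)
print(len(text), "characters ->", result.total_tokens, "tokens")
```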
Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/91d84430-e6c2-47d7-88a2-d0a49aeb3e61 b/docstore/91d84430-e6c2-47d7-88a2-d0a49aeb3e61 new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/91d84430-e6c2-47d7-88a2-d0a49aeb3e61 @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. 
The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/91ee3efc-f1c5-4598-b783-1c42a22304cc b/docstore/91ee3efc-f1c5-4598-b783-1c42a22304cc new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/91ee3efc-f1c5-4598-b783-1c42a22304cc @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/91fba482-20fa-4e28-af75-2c6b36fb1f73 b/docstore/91fba482-20fa-4e28-af75-2c6b36fb1f73 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/91fba482-20fa-4e28-af75-2c6b36fb1f73 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/921357f2-224d-4227-96bf-f04eb2956604 b/docstore/921357f2-224d-4227-96bf-f04eb2956604 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/921357f2-224d-4227-96bf-f04eb2956604 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
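A rough sketch of calling one of the preview TTS models described above, assuming the google-genai Python SDK, the types.SpeechConfig / VoiceConfig / PrebuiltVoiceConfig classes, and the prebuilt voice name "Kore"; the 24 kHz, 16-bit, mono PCM output format is also an assumption here, not something stated in this excerpt.

Python

import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        # TTS models return audio, so request the AUDIO modality.
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The generated audio arrives as inline bytes on the first content part.
pcm = response.candidates[0].content.parts[0].inline_data.data

# Wrap the raw PCM in a WAV container (assumed: 24 kHz, 16-bit, mono).
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)
    f.setsampwidth(2)
    f.setframerate(24000)
    f.writeframes(pcm)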
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/92908fe0-2d15-453e-b2ed-581494b82ebb b/docstore/92908fe0-2d15-453e-b2ed-581494b82ebb new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/92908fe0-2d15-453e-b2ed-581494b82ebb @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/92cb4c7f-5d83-4e95-b95a-7eefe9a4570f b/docstore/92cb4c7f-5d83-4e95-b95a-7eefe9a4570f new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/92cb4c7f-5d83-4e95-b95a-7eefe9a4570f @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. 
Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/92d01027-f3eb-4667-9aba-d05536934efc b/docstore/92d01027-f3eb-4667-9aba-d05536934efc new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/92d01027-f3eb-4667-9aba-d05536934efc @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/92d5693c-738f-4036-ba22-3a6920627d0d b/docstore/92d5693c-738f-4036-ba22-3a6920627d0d new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/92d5693c-738f-4036-ba22-3a6920627d0d @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/92dde4d9-791b-4061-b2aa-c885feb169e8 b/docstore/92dde4d9-791b-4061-b2aa-c885feb169e8 new file mode 100644 index 0000000000000000000000000000000000000000..53d3e426b4ff6e145f59bebdb86773397956de24 --- /dev/null +++ b/docstore/92dde4d9-791b-4061-b2aa-c885feb169e8 @@ -0,0 +1 @@ +field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. 
Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. Schemas that contain $ref cannot contain any properties other than those starting with a $ . Here's an example of generating a JSON Schema with Pydantic and submitting it to the model: curl "https://generativelanguage.googleapis.com/v1alpha/models/\ gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d @- < /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/930ff927-7ff3-41c6-9767-e611de144faa b/docstore/930ff927-7ff3-41c6-9767-e611de144faa new file mode 100644 index 0000000000000000000000000000000000000000..3be596a615ed567f90abc0547a663aefd35b8061 --- /dev/null +++ b/docstore/930ff927-7ff3-41c6-9767-e611de144faa @@ -0,0 +1 @@ +:{}, 'temperature' : 0.7 , 'response_modalities' :[ 'TEXT' ] } }, 'http_options' : { 'api_version' : 'v1alpha' }, } ) # You'll need to pass the value under token.name back to your client to use it JavaScript import { GoogleGenAI } from "@google/genai" ; const client = new GoogleGenAI ({}); const expireTime = new Date ( Date . now () + 30 * 60 * 1000 ). toISOString (); const token = await client . authTokens . create ({ config : { uses : 1 , // The default expireTime : expireTime , liveConnectConstraints : { model : 'gemini-2.0-flash-live-001' , config : { sessionResumption : {}, temperature : 0.7 , responseModalities : [ 'TEXT' ] } }, httpOptions : { apiVersion : 'v1alpha' } } }); // You'll need to pass the value under token.name back to your client to use it You can also lock a subset of fields, see the SDK documentation for more info. 
Connect to Live API with an ephemeral token Once you have an ephemeral token, you use it as if it were an API key (but remember, it only works for the live API, and only with the v1alpha version of the API). Note that use of ephemeral tokens only adds value when deploying applications that follow client-to-server implementation approach. JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; // Use the token generated in the "Create an ephemeral token" section here const ai = new GoogleGenAI ({ apiKey : token . name }); const model = 'gemini-2.0-flash-live-001' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : { ... }, }); // Send content... session . close (); } main (); Note: If not using the SDK, note that ephemeral tokens must either be passed in an access_token query parameter, or in an HTTP Authorization prefixed by the auth-scheme Token . See Get started with Live API for more examples. Best practices Set a short expiration duration using the expire_time parameter. Tokens expire, \ No newline at end of file diff --git a/docstore/933dfa07-3339-470d-92c3-6f8c93d4cc1e b/docstore/933dfa07-3339-470d-92c3-6f8c93d4cc1e new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/933dfa07-3339-470d-92c3-6f8c93d4cc1e @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/935a76cb-cffc-4282-8727-442b47480dd5 b/docstore/935a76cb-cffc-4282-8727-442b47480dd5 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/935a76cb-cffc-4282-8727-442b47480dd5 @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/938c6cc4-bc40-4326-95be-4245ea3688b1 b/docstore/938c6cc4-bc40-4326-95be-4245ea3688b1 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/938c6cc4-bc40-4326-95be-4245ea3688b1 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . 
generate_content ( model \ No newline at end of file diff --git a/docstore/93a14e1a-f4f1-44b5-bcf3-18e56d8e6217 b/docstore/93a14e1a-f4f1-44b5-bcf3-18e56d8e6217 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/93a14e1a-f4f1-44b5-bcf3-18e56d8e6217 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/93a205aa-3fd9-4e80-9cf5-031e24968be7 b/docstore/93a205aa-3fd9-4e80-9cf5-031e24968be7 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/93a205aa-3fd9-4e80-9cf5-031e24968be7 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). 
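Step 1 below defines the set_light_values declaration; the rest of the cycle — sending the declaration as a tool, reading the model's function_call part, executing the function locally, and returning its result to the model — can be sketched as follows. This is a hedged outline assuming the google-genai Python SDK; the prompt text and role strings are illustrative, not taken from this excerpt.

Python

from google import genai
from google.genai import types

# Abbreviated version of the set_light_values declaration from Step 1 below.
set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {"type": "integer", "description": "Light level from 0 to 100."},
            "color_temp": {"type": "string", "enum": ["daylight", "cool", "warm"]},
        },
        "required": ["brightness", "color_temp"],
    },
}

def set_light_values(brightness: int, color_temp: str) -> dict:
    """Mock smart-light API."""
    return {"brightness": brightness, "colorTemperature": color_temp}

client = genai.Client()
tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])
prompt = "Turn the lights down to a romantic level"

# Step 2: call the model with the declaration; it decides whether to call the function.
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=prompt, config=config
)
function_call = response.candidates[0].content.parts[0].function_call  # assumes the model chose to call
print(function_call.name, dict(function_call.args))

# Step 3: execute the function in your own application code.
result = set_light_values(**function_call.args)

# Step 4: send the result back so the model can produce a user-friendly answer.
follow_up = [
    types.Content(role="user", parts=[types.Part(text=prompt)]),
    response.candidates[0].content,  # the model's function-call turn
    types.Content(
        role="user",
        parts=[types.Part.from_function_response(
            name=function_call.name, response={"result": result}
        )],
    ),
]
final = client.models.generate_content(
    model="gemini-2.5-flash", contents=follow_up, config=config
)
print(final.text)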
Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/93aa02d1-fdd7-4329-ba01-c0eff4a0c232 b/docstore/93aa02d1-fdd7-4329-ba01-c0eff4a0c232 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/93aa02d1-fdd7-4329-ba01-c0eff4a0c232 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . 
point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/93acf857-fc4b-4702-b36a-0a7f99278b26 b/docstore/93acf857-fc4b-4702-b36a-0a7f99278b26 new file mode 100644 index 0000000000000000000000000000000000000000..10d595bd2c735f8912abb00e69220b9ae90d3d23 --- /dev/null +++ b/docstore/93acf857-fc4b-4702-b36a-0a7f99278b26 @@ -0,0 +1 @@ +Audio understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Audio understanding Gemini can analyze and understand audio input, enabling use cases like the following: Describe, summarize, or answer questions about audio content. Provide a transcription of the audio. Analyze specific segments of the audio. This guide shows you how to use the Gemini API to generate a text response to audio input. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Input audio You can provide audio data to Gemini in the following ways: Upload an audio file before making a request to generateContent . Pass inline audio data with the request to generateContent . Upload an audio file You can use the Files API to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads an audio file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mp3" }, }); const response = await ai . models . generateContent ({ \ No newline at end of file diff --git a/docstore/93ad551d-ef80-4138-a9e3-37cd2082f8e8 b/docstore/93ad551d-ef80-4138-a9e3-37cd2082f8e8 new file mode 100644 index 0000000000000000000000000000000000000000..03e316e434a13c2a6804dc8cff96f196f07c7e52 --- /dev/null +++ b/docstore/93ad551d-ef80-4138-a9e3-37cd2082f8e8 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers. Here's how I'll approach this: 1. **Generate Prime Numbers:** I'll use an iterative method to find prime numbers. I'll start with 2 and check if each subsequent number is divisible by any number between 2 and its square root. If not, it's a prime. 2. **Store Primes:** I'll store the prime numbers in a list until I have 50 of them. 3. **Calculate the Sum:** Finally, I'll sum the prime numbers in the list. 
Here's the Python code to do this: def is_prime(n): """Efficiently checks if a number is prime.""" if n <= 1: return False if n <= 3: return True if n % 2 == 0 or n % 3 == 0: return False i = 5 while i * i <= n: if n % i == 0 or n % (i + 2) == 0: return False i += 6 return True primes = [] num = 2 while len(primes) < 50: if is_prime(num): primes.append(num) num += 1 sum_of_primes = sum(primes) print(f'{primes=}') print(f'{sum_of_primes=}') primes=[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229] sum_of_primes=5117 The sum of the first 50 prime numbers is 5117. This output combines several content parts that the model returns when using code execution: text : Inline text generated by the model executableCode : Code generated by the model that is meant to be executed codeExecutionResult : Result of the executable code The naming conventions for these parts vary by programming language. Use code execution in chat You can also use code execution as part of a chat. Python from google import genai from google.genai import types client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )] ), ) response = chat . send_message ( "I have a math question for you." ) print ( response . text ) response = chat . send_message ( "What is \ No newline at end of file diff --git a/docstore/93af2fc5-1e96-4050-a2ce-4b7a678ca71f b/docstore/93af2fc5-1e96-4050-a2ce-4b7a678ca71f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/93af2fc5-1e96-4050-a2ce-4b7a678ca71f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
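The chat example above enables the code execution tool; the same tool also works in a single generateContent request. The following is a minimal sketch (assuming the google-genai Python SDK) that iterates over the returned parts to pick out the text, executableCode, and codeExecutionResult parts listed above.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the sum of the first 50 prime numbers? "
             "Generate and run code for the calculation.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
    ),
)

# The response mixes the part types described above.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print("TEXT:", part.text)                        # inline commentary from the model
    if part.executable_code is not None:
        print("CODE:", part.executable_code.code)        # code the model generated
    if part.code_execution_result is not None:
        print("RESULT:", part.code_execution_result.output)  # output of running that code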
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/93be0867-d67b-4afc-860b-e91e7dace985 b/docstore/93be0867-d67b-4afc-860b-e91e7dace985 new file mode 100644 index 0000000000000000000000000000000000000000..fdea6397d0ee0c5ce13453eceb7f458532b87688 --- /dev/null +++ b/docstore/93be0867-d67b-4afc-860b-e91e7dace985 @@ -0,0 +1 @@ +"BLOCK_MEDIUM_AND_ABOVE"} ], "contents": [{ "parts":[{ "text": "' I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write a ironic phrase about them. '"}]}]}' > request.json curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d @request.json 2 > /dev/null Next steps See the API reference to learn more about the full API. Review the safety guidance for a general look at safety considerations when developing with LLMs. Learn more about assessing probability versus severity from the Jigsaw team Learn more about the products that contribute to safety solutions like the Perspective API . * You can use these safety settings to create a toxicity classifier. See the classification example to get started. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/93cadaa2-757a-4a66-b724-1fb93520d78c b/docstore/93cadaa2-757a-4a66-b724-1fb93520d78c new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/93cadaa2-757a-4a66-b724-1fb93520d78c @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/93cba0e6-10cc-4072-b930-d015caabe1a4 b/docstore/93cba0e6-10cc-4072-b930-d015caabe1a4 new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/93cba0e6-10cc-4072-b930-d015caabe1a4 @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . 
LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio \ No newline at end of file diff --git a/docstore/93e4b530-7486-4d17-a950-7952a0b9e93c b/docstore/93e4b530-7486-4d17-a950-7952a0b9e93c new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/93e4b530-7486-4d17-a950-7952a0b9e93c @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/93f7037d-c299-4579-ab8a-f9eabc2aa79e b/docstore/93f7037d-c299-4579-ab8a-f9eabc2aa79e new file mode 100644 index 0000000000000000000000000000000000000000..122c682d2774097387ed4735af082d43f98d76f5 --- /dev/null +++ b/docstore/93f7037d-c299-4579-ab8a-f9eabc2aa79e @@ -0,0 +1 @@ +regions. Can I use 1M tokens in the free tier? The free tier for Gemini API differs based on the model selected. For now, you can try the 1M token context window in the following ways: In Google AI Studio With pay-as-you-go plans With free-of-charge plans for select models See the latest free-of-charge rate limits per model on rate limits page . How can I calculate the number of tokens I'm using? Use the GenerativeModel.count_tokens method to count the number of tokens. Refer to the Tokens guide to learn more about tokens. Can I use my Google Cloud credits with the Gemini API? Yes, Google Cloud credits can be used towards Gemini API usage. How is billing handled? Billing for the Gemini API is handled by the Cloud Billing system. Am I charged for failed requests? 
If your request fails with a 400 or 500 error, you won't be charged for the tokens used. However, the request will still count against your quota. Is there a charge for fine-tuning the models? Model tuning is free, but inference on tuned models is charged at the same rate as the base models. Is GetTokens billed? Requests to the GetTokens API are not billed, and they don't count against inference quota. How is my Google AI Studio data handled if I have a paid API account? Refer to the terms for details on how data is handled when Cloud billing is enabled (see "How Google Uses Your Data" under "Paid Services"). Note that your Google AI Studio prompts are treated under the same "Paid Services" terms so long as at least 1 API project has billing enabled, which you can validate on the Gemini API Key page if you see any projects marked as "Paid" under "Plan". Where can I get help with billing? To get help with billing, see Get Cloud Billing support . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered \ No newline at end of file diff --git a/docstore/93ff63e9-daa6-44a0-bfd4-d36ff8daff47 b/docstore/93ff63e9-daa6-44a0-bfd4-d36ff8daff47 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/93ff63e9-daa6-44a0-bfd4-d36ff8daff47 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/94080375-ee96-4383-8c06-f4f74b2486f9 b/docstore/94080375-ee96-4383-8c06-f4f74b2486f9 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/94080375-ee96-4383-8c06-f4f74b2486f9 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
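A minimal sketch of generating embeddings with the text-embedding-004 model described above, assuming the google-genai Python SDK's embed_content method; the example strings and the use of the first four vector components are purely illustrative.

Python

from google import genai

client = genai.Client()

# Embed two strings with the text-embedding-004 model described above.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How do I bake bread?"],
)

for embedding in result.embeddings:
    # Each embedding is a 768-dimensional vector for this model.
    print(len(embedding.values), embedding.values[:4])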
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/9409341b-7460-4752-b91c-5058671fb128 b/docstore/9409341b-7460-4752-b91c-5058671fb128 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/9409341b-7460-4752-b91c-5058671fb128 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/943613ac-06ef-435b-b0ad-340b6ec13195 b/docstore/943613ac-06ef-435b-b0ad-340b6ec13195 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/943613ac-06ef-435b-b0ad-340b6ec13195 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. 
To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash. Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001. Preview Points to a preview model which may not be suitable for production use, may come with more restrictive rate limits, and may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05. Experimental Points to an experimental model which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05. Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/94406976-f344-436a-8bd1-34853badfdb8 b/docstore/94406976-f344-436a-8bd1-34853badfdb8 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/94406976-f344-436a-8bd1-34853badfdb8 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models.
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/946227f2-eba9-45d5-9503-c7b20ccf48a0 b/docstore/946227f2-eba9-45d5-9503-c7b20ccf48a0 new file mode 100644 index 0000000000000000000000000000000000000000..aa09fa8779a782eb0f4519da995c2b766869468f --- /dev/null +++ b/docstore/946227f2-eba9-45d5-9503-c7b20ccf48a0 @@ -0,0 +1 @@ +genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Refer to timestamps You can refer to specific sections of an audio file using timestamps of the form MM:SS . For example, the following prompt requests a transcript that Starts at 2 minutes 30 seconds from the beginning of the file. Ends at 3 minutes 29 seconds from the beginning of the file. Python # Create a prompt containing timestamps. prompt = "Provide a transcript of the speech from 02:30 to 03:29." JavaScript // Create a prompt containing timestamps. const prompt = "Provide a transcript of the speech from 02:30 to 03:29." Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Provide a transcript of the speech " + "between the timestamps 02:30 and 03:29." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } Count tokens Call the countTokens method to get a count of the number of tokens in an audio file. For example: Python response = client . models . count_tokens ( model = 'gemini-2.5-flash' , contents = [ myfile ] ) print ( response ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai \ No newline at end of file diff --git a/docstore/94c86655-f03c-4675-97e8-753cd922ccfb b/docstore/94c86655-f03c-4675-97e8-753cd922ccfb new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/94c86655-f03c-4675-97e8-753cd922ccfb @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/94d16dfe-9d75-4a9f-b319-be2502e2788e b/docstore/94d16dfe-9d75-4a9f-b319-be2502e2788e new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/94d16dfe-9d75-4a9f-b319-be2502e2788e @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . 
close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/94e58fb3-08c0-47e7-a5d5-c8297ce98cc0 b/docstore/94e58fb3-08c0-47e7-a5d5-c8297ce98cc0 new file mode 100644 index 0000000000000000000000000000000000000000..caeb3ce515fa32fbe54079666313c34a9d6ea8d0 --- /dev/null +++ b/docstore/94e58fb3-08c0-47e7-a5d5-c8297ce98cc0 @@ -0,0 +1 @@ +picked up automatically by the client when using the Gemini API libraries . Otherwise you will need to pass your API key as an argument when initializing the client. Note that all code samples in the Gemini API docs assume that you have set the environment variable GEMINI_API_KEY . Python from google import genai # The client gets the API key from the environment variable `GEMINI_API_KEY`. client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // The client gets the API key from the environment variable `GEMINI_API_KEY`. const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () // The client gets the API key from the environment variable `GEMINI_API_KEY`. client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . 
Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { // The client gets the API key from the environment variable `GEMINI_API_KEY`. Client client = new Client (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } Apps Script // See \ No newline at end of file diff --git a/docstore/94fb6c5f-99f8-47b4-8720-65972ccb3904 b/docstore/94fb6c5f-99f8-47b4-8720-65972ccb3904 new file mode 100644 index 0000000000000000000000000000000000000000..cd05fff49dc646621e4ad5455e6cddce9e307548 --- /dev/null +++ b/docstore/94fb6c5f-99f8-47b4-8720-65972ccb3904 @@ -0,0 +1 @@ +models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . \ No newline at end of file diff --git a/docstore/95058626-1846-47a5-92a0-255086abd57b b/docstore/95058626-1846-47a5-92a0-255086abd57b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/95058626-1846-47a5-92a0-255086abd57b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. 
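Related to the quickstart examples above: when the GEMINI_API_KEY environment variable is not set, the key can be passed directly to the client constructor. A minimal sketch with the google-genai Python SDK; the placeholder key string is illustrative.

from google import genai

# Pass the key explicitly instead of relying on the GEMINI_API_KEY environment variable.
# "YOUR_API_KEY" is a placeholder, not a real key.
client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)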
Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/9508f1b5-d787-4c30-9053-0866fa9a8fe9 b/docstore/9508f1b5-d787-4c30-9053-0866fa9a8fe9 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/9508f1b5-d787-4c30-9053-0866fa9a8fe9 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/950ba40a-a1ac-42ae-96b5-641964a3a0ea b/docstore/950ba40a-a1ac-42ae-96b5-641964a3a0ea new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/950ba40a-a1ac-42ae-96b5-641964a3a0ea @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/95102f70-deaa-4398-a776-259516b6ef08 b/docstore/95102f70-deaa-4398-a776-259516b6ef08 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/95102f70-deaa-4398-a776-259516b6ef08 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise 
noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/955749c8-172f-4a45-87fe-6c6217bef772 b/docstore/955749c8-172f-4a45-87fe-6c6217bef772 new file mode 100644 index 0000000000000000000000000000000000000000..192b0b2416dcb93d71b59b67d2be295b0006372b --- /dev/null +++ b/docstore/955749c8-172f-4a45-87fe-6c6217bef772 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session#goaway-message Title: Session management with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/955f6f21-0f47-4549-8b94-cc151731e869 b/docstore/955f6f21-0f47-4549-8b94-cc151731e869 new file mode 100644 index 0000000000000000000000000000000000000000..dfeae8fcf584330ed11cdd48e07105d5f4f56b31 --- /dev/null +++ b/docstore/955f6f21-0f47-4549-8b94-cc151731e869 @@ -0,0 +1 @@ +retrieval_tool ] ) response = client . models . generate_content ( model = 'gemini-1.5-flash' , contents = "Who won the euro 2024?" , config = config , ) print ( response . text ) if not response . candidates [ 0 ] . grounding_metadata : print ( " \n Model answered from its own knowledge." ) JavaScript // Note: This is a legacy approach for Gemini 1.5 models. // The 'googleSearch' tool is recommended for all new development. import { GoogleGenAI , DynamicRetrievalConfigMode } from "@google/genai" ; const ai = new GoogleGenAI ({}); const retrievalTool = { googleSearchRetrieval : { dynamicRetrievalConfig : { mode : DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamicThreshold : 0.7 , // Only search if confidence > 70% }, }, }; const config = { tools : [ retrievalTool ], }; const response = await ai . models . generateContent ({ model : "gemini-1.5-flash" , contents : "Who won the euro 2024?" , config , }); console . log ( response . text ); if ( ! response . candidates ? .[ 0 ] ? . groundingMetadata ) { console . log ( "\nModel answered from its own knowledge." ); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ {"parts": [{"text": "Who won the euro 2024?"}]} ], "tools": [{ "google_search_retrieval": { "dynamic_retrieval_config": { "mode": "MODE_DYNAMIC", "dynamic_threshold": 0.7 } } }] }' What's next Try the Grounding with Google Search in the Gemini API Cookbook . Learn about other available tools, like Function Calling . Learn how to augment prompts with specific URLs using the URL context tool . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. 
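As the code comments above note, the googleSearch tool is the recommended replacement for the legacy dynamic retrieval configuration in new development. Below is a hedged sketch of that newer tool with the google-genai Python SDK; the model name and prompt mirror the examples above, and the metadata fields printed at the end are an assumption about where grounding details surface.

from google import genai
from google.genai import types

client = genai.Client()

# Ground the response with Google Search (recommended for new development).
config = types.GenerateContentConfig(
    tools=[types.Tool(google_search=types.GoogleSearch())],
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Who won the euro 2024?",
    config=config,
)

print(response.text)

# When the model actually searched, grounding metadata is attached to the candidate.
metadata = response.candidates[0].grounding_metadata
if metadata:
    print(metadata.web_search_queries)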
\ No newline at end of file diff --git a/docstore/957fdc3d-2333-4eed-8660-dafe17e759c0 b/docstore/957fdc3d-2333-4eed-8660-dafe17e759c0 new file mode 100644 index 0000000000000000000000000000000000000000..002d74f0081d5a1754ecf09d829f2e05938acd13 --- /dev/null +++ b/docstore/957fdc3d-2333-4eed-8660-dafe17e759c0 @@ -0,0 +1 @@ +turn_on_the_lights , turn_off_the_lights ] } ] const config = { responseModalities : [ Modality . TEXT ], tools : tools } // ... remaining model call What's next Check out more examples of using tools with the Live API in the Tool use cookbook . Get the full story on features and configurations from the Live API Capabilities guide . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/9591a328-cc27-44c0-81bc-7cfdc9ecd5bf b/docstore/9591a328-cc27-44c0-81bc-7cfdc9ecd5bf new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/9591a328-cc27-44c0-81bc-7cfdc9ecd5bf @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. 
Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/95b07f81-a188-4696-a877-4674f56a7656 b/docstore/95b07f81-a188-4696-a877-4674f56a7656 new file mode 100644 index 0000000000000000000000000000000000000000..0abe7c770a1c93708b98ee8b0a34df5d347d5c9d --- /dev/null +++ b/docstore/95b07f81-a188-4696-a877-4674f56a7656 @@ -0,0 +1 @@ +candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about \ No newline at end of file diff --git a/docstore/95b1269f-ec6c-4573-b0b1-0b8b52fa2e90 b/docstore/95b1269f-ec6c-4573-b0b1-0b8b52fa2e90 new file mode 100644 index 0000000000000000000000000000000000000000..5a67c041917cdaf904b0e03794a07af474503a9a --- /dev/null +++ b/docstore/95b1269f-ec6c-4573-b0b1-0b8b52fa2e90 @@ -0,0 +1 @@ +upload..." curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." 
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = \ No newline at end of file diff --git a/docstore/95bc784e-fc92-4052-901e-5b45fc659605 b/docstore/95bc784e-fc92-4052-901e-5b45fc659605 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/95bc784e-fc92-4052-901e-5b45fc659605 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. 
This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/95d2e1e9-38fc-469d-b64c-7b2d82a2d3fd b/docstore/95d2e1e9-38fc-469d-b64c-7b2d82a2d3fd new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/95d2e1e9-38fc-469d-b64c-7b2d82a2d3fd @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/95e8ff08-e9fb-4955-900b-c4d863f47339 b/docstore/95e8ff08-e9fb-4955-900b-c4d863f47339 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/95e8ff08-e9fb-4955-900b-c4d863f47339 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
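Returning to parallel function calling, whose example is cut off above at the power_disco_ball declaration: the sketch below is a self-contained, hedged reconstruction using the google-genai Python SDK. Only the power_disco_ball name comes from the text above; the second declaration, the parameters, and the prompt are illustrative assumptions.

from google import genai
from google.genai import types

client = genai.Client()

# Two independent function declarations the model may call in the same turn.
power_disco_ball = {
    "name": "power_disco_ball",
    "description": "Powers the spinning disco ball.",
    "parameters": {
        "type": "object",
        "properties": {"power": {"type": "boolean"}},
        "required": ["power"],
    },
}
start_music = {
    "name": "start_music",  # illustrative second function, not from the text above
    "description": "Starts music matching the requested mood.",
    "parameters": {
        "type": "object",
        "properties": {"energetic": {"type": "boolean"}, "loud": {"type": "boolean"}},
        "required": ["energetic", "loud"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[power_disco_ball, start_music])],
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn this place into a party!",
    config=config,
)

# Because the functions are independent, the model can return several calls at once.
for call in response.function_calls or []:
    print(call.name, call.args)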
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/95f1245b-22e0-43d8-992f-f3d04b22103f b/docstore/95f1245b-22e0-43d8-992f-f3d04b22103f new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/95f1245b-22e0-43d8-992f-f3d04b22103f @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/95f7b153-0e4f-451e-af72-555913106fb8 b/docstore/95f7b153-0e4f-451e-af72-555913106fb8 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/95f7b153-0e4f-451e-af72-555913106fb8 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/95fba0c5-2fa8-4c1a-b365-8e86b7df545f b/docstore/95fba0c5-2fa8-4c1a-b365-8e86b7df545f new file mode 100644 index 0000000000000000000000000000000000000000..4698c2cf5d2dc524303259a813fe032a26136eee --- /dev/null +++ b/docstore/95fba0c5-2fa8-4c1a-b365-8e86b7df545f @@ -0,0 +1 @@ +blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation. A more detailed prompt results in a video that is more focused with a richer environment. A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. 
The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination. Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene. Snow leopard This example demonstrates the output Veo might generate for a simple prompt. Prompt Generated output A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render. Running snow leopard This prompt has more detail and demonstrates generated output that might be closer to what you want in your video. Prompt Generated output Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation. Examples by writing elements These examples show you how to refine your prompts by each basic element. Subject \ No newline at end of file diff --git a/docstore/96006c49-8412-4106-8b06-c1c3ac50978b b/docstore/96006c49-8412-4106-8b06-c1c3ac50978b new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/96006c49-8412-4106-8b06-c1c3ac50978b @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
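As a rough sketch of how the TTS preview models above might be invoked with the google-genai Python SDK: the voice name, prompt, and output handling are illustrative assumptions rather than details taken from this page (the model returns raw PCM audio that is typically wrapped in a WAV container).

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Welcome to the show!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The synthesized audio comes back as inline bytes on the first candidate part.
audio_bytes = response.candidates[0].content.parts[0].inline_data.data
with open("speech.pcm", "wb") as f:
    f.write(audio_bytes)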
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/96058835-bfa3-464b-8266-3c13d9d4fe5f b/docstore/96058835-bfa3-464b-8266-3c13d9d4fe5f new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/96058835-bfa3-464b-8266-3c13d9d4fe5f @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/961531c8-4dfe-4b7c-a8a9-34c28c51906f b/docstore/961531c8-4dfe-4b7c-a8a9-34c28c51906f new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5762f2e47222d475421a2613ce0f732fa260 --- /dev/null +++ b/docstore/961531c8-4dfe-4b7c-a8a9-34c28c51906f @@ -0,0 +1 @@ +in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. 
Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at \ No newline at end of file diff --git a/docstore/962e3aec-ec05-44c6-bfae-0dcc09c70678 b/docstore/962e3aec-ec05-44c6-bfae-0dcc09c70678 new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/962e3aec-ec05-44c6-bfae-0dcc09c70678 @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. 
google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/962ef9f3-d530-417c-8a38-ffa1602481bb b/docstore/962ef9f3-d530-417c-8a38-ffa1602481bb new file mode 100644 index 0000000000000000000000000000000000000000..be865665baa597a5b341e658abc6f47e616f09e1 --- /dev/null +++ b/docstore/962ef9f3-d530-417c-8a38-ffa1602481bb @@ -0,0 +1 @@ +OpenAI compatibility | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, \ No newline at end of file diff --git a/docstore/9632611f-957f-452e-9756-ea565dcdcf5a b/docstore/9632611f-957f-452e-9756-ea565dcdcf5a new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/9632611f-957f-452e-9756-ea565dcdcf5a @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . 
Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/96351df4-4161-4db0-ab59-bc2e502c96d6 b/docstore/96351df4-4161-4db0-ab59-bc2e502c96d6 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/96351df4-4161-4db0-ab59-bc2e502c96d6 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/9643ed2f-5364-49de-81e2-afb0f92ff30e b/docstore/9643ed2f-5364-49de-81e2-afb0f92ff30e new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/9643ed2f-5364-49de-81e2-afb0f92ff30e @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . 
Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/964ccea8-ad47-49bd-aaa4-86f29d5b2254 b/docstore/964ccea8-ad47-49bd-aaa4-86f29d5b2254 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/964ccea8-ad47-49bd-aaa4-86f29d5b2254 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/964f0fad-faa3-43a2-933c-b42b99620dfd b/docstore/964f0fad-faa3-43a2-933c-b42b99620dfd new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/964f0fad-faa3-43a2-933c-b42b99620dfd @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/96664ea2-c345-4e35-ab06-233595b5cbfb b/docstore/96664ea2-c345-4e35-ab06-233595b5cbfb new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/96664ea2-c345-4e35-ab06-233595b5cbfb @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/9672d116-f6f4-4a93-8b10-6fd22c92df7a b/docstore/9672d116-f6f4-4a93-8b10-6fd22c92df7a new file mode 100644 index 0000000000000000000000000000000000000000..6cd03da77a7c04bb143fe9601905375d481c4c1f --- /dev/null +++ b/docstore/9672d116-f6f4-4a93-8b10-6fd22c92df7a @@ -0,0 +1 @@ +Gemini 2.0 Flash Preview Image Generation 2,000 3,000,000 100,000 Gemini 2.0 Flash-Lite 20,000 10,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 3 Model RPM TPM RPD Gemini 2.5 Pro 2,000 8,000,000 -- Gemini 2.5 Flash 10,000 8,000,000 -- Gemini 2.5 Flash-Lite Preview 06-17 30,000 30,000,000 -- Gemini 2.5 Flash Preview TTS 1,000 1,000,000 -- Gemini 2.5 Pro Preview TTS 100 1,000,000 -- Gemini 2.0 Flash 30,000 30,000,000 -- Gemini 2.0 Flash Preview Image Generation 5,000 5,000,000 -- Gemini 2.0 Flash-Lite 30,000 30,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Specified rate limits are not guaranteed and actual capacity may vary. Live API rate limits The Live API processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. This API has a different set of rate limits than the standard Gemini API calls. Free Tier Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 3 1,000,000 -- Gemini 2.0 Flash Live 3 1,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 1 25,000 5 Gemini 2.5 Flash Experimental Native Audio Thinking Dialog 1 10,000 5 Tier 1 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 50 4,000,000 -- Gemini 2.0 Flash Live 50 4,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 3 50,000 50 Gemini 2.5 Flash Experimental Native Audio Thinking \ No newline at end of file diff --git a/docstore/967e054f-3176-4b17-b46f-510d619e2e2a b/docstore/967e054f-3176-4b17-b46f-510d619e2e2a new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/967e054f-3176-4b17-b46f-510d619e2e2a @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. 
The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/96b0afac-eb96-4ab2-8ace-eb270d9837c2 b/docstore/96b0afac-eb96-4ab2-8ace-eb270d9837c2 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/96b0afac-eb96-4ab2-8ace-eb270d9837c2 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/96b364cc-6e1c-4404-b185-f4c3c90b1658 b/docstore/96b364cc-6e1c-4404-b185-f4c3c90b1658 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/96b364cc-6e1c-4404-b185-f4c3c90b1658 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/96c28b8d-9b73-4f47-8e58-fe57accc4d70 b/docstore/96c28b8d-9b73-4f47-8e58-fe57accc4d70 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/96c28b8d-9b73-4f47-8e58-fe57accc4d70 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/96c4e7a8-090d-4e54-9a12-44b5348800e5 b/docstore/96c4e7a8-090d-4e54-9a12-44b5348800e5 new file mode 100644 index 0000000000000000000000000000000000000000..1fd617a587d76016a0c4d5b56098be9076683928 --- /dev/null +++ b/docstore/96c4e7a8-090d-4e54-9a12-44b5348800e5 @@ -0,0 +1 @@ +candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the action requested by the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase).
description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, \ No newline at end of file diff --git a/docstore/96d6f406-001b-4987-9a7d-d66a7840fb18 b/docstore/96d6f406-001b-4987-9a7d-d66a7840fb18 new file mode 100644 index 0000000000000000000000000000000000000000..53d3e426b4ff6e145f59bebdb86773397956de24 --- /dev/null +++ b/docstore/96d6f406-001b-4987-9a7d-d66a7840fb18 @@ -0,0 +1 @@ +field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. Schemas that contain $ref cannot contain any properties other than those starting with a $ . Here's an example of generating a JSON Schema with Pydantic and submitting it to the model: curl "https://generativelanguage.googleapis.com/v1alpha/models/\ gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d @- < setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/9873d503-19e4-438f-ad12-0fa2ef6d94f1 b/docstore/9873d503-19e4-438f-ad12-0fa2ef6d94f1 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/9873d503-19e4-438f-ad12-0fa2ef6d94f1 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. 
\ No newline at end of file diff --git a/docstore/987c86ea-8f81-4fe9-90aa-d2bff9b01c8b b/docstore/987c86ea-8f81-4fe9-90aa-d2bff9b01c8b new file mode 100644 index 0000000000000000000000000000000000000000..d7929c084fe51515c774daa6a820ae62dc558406 --- /dev/null +++ b/docstore/987c86ea-8f81-4fe9-90aa-d2bff9b01c8b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling/tutorial Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/989688a2-66f2-4999-a62d-46428842bd9e b/docstore/989688a2-66f2-4999-a62d-46428842bd9e new file mode 100644 index 0000000000000000000000000000000000000000..be865665baa597a5b341e658abc6f47e616f09e1 --- /dev/null +++ b/docstore/989688a2-66f2-4999-a62d-46428842bd9e @@ -0,0 +1 @@ +OpenAI compatibility | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, \ No newline at end of file diff --git a/docstore/98d9ee44-32da-4eb3-b8ca-e546f612372a b/docstore/98d9ee44-32da-4eb3-b8ca-e546f612372a new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/98d9ee44-32da-4eb3-b8ca-e546f612372a @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send the user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/98e403ed-d65f-485e-b9ad-6c5e01dca398 b/docstore/98e403ed-d65f-485e-b9ad-6c5e01dca398 new file mode 100644 index 0000000000000000000000000000000000000000..097e48b20f2cbfa1b05db2a0f80e7f3c1583707a --- /dev/null +++ b/docstore/98e403ed-d65f-485e-b9ad-6c5e01dca398 @@ -0,0 +1 @@ +Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Get a Gemini API Key Get a Gemini API key and make your first API request in minutes. Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . 
Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = new Client (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H \ No newline at end of file diff --git a/docstore/99286e05-7371-4084-bb47-5e064417adad b/docstore/99286e05-7371-4084-bb47-5e064417adad new file mode 100644 index 0000000000000000000000000000000000000000..ebd105342549a255faf01232b49ba70d20b000ef --- /dev/null +++ b/docstore/99286e05-7371-4084-bb47-5e064417adad @@ -0,0 +1 @@ +the next tier. Why use the paid tier? When you enable billing and use the paid tier, you benefit from higher rate limits , and your prompts and responses aren't used to improve Google products. For more information on data use for paid services, see the terms of service . Cloud Billing The Gemini API uses Cloud Billing for billing services. To use the paid tier, you must set up Cloud Billing on your cloud project. After you've enabled Cloud Billing, you can use Cloud Billing tools to track spending, understand costs, make payments, and access Cloud Billing support. Enable billing You can enable Cloud Billing starting from Google AI Studio: Open Google AI Studio . In the bottom of the left sidebar, select Settings > Plan information . Click Set up Billing for your chosen project to enable Cloud Billing. Monitor usage After you enable Cloud Billing, you can monitor your usage of the Gemini API in the Google Cloud console . The service name for the API is generativelanguage.googleapis.com , and in the console the Gemini API is also referred to as the Generative Language API . To learn more, see the Google Cloud documentation on monitoring API usage . Frequently asked questions This section provides answers to frequently asked questions. What am I billed for? Gemini API pricing is based on the following: Input token count Output token count Cached token count Cached token storage duration For pricing information, see the pricing page . Where can I view my quota? You can view your quota and system limits in the Google Cloud console . How do I request more quota? To request more quota, follow the instructions at How to request an upgrade . Can I use the Gemini API for free in EEA (including EU), the UK, and CH? Yes, we make the free tier and paid tier available in many regions . If I set up billing with the Gemini API, will I be charged for my Google AI Studio usage? No, Google AI Studio usage remains free of charge regardless of if you set up billing across all supported \ No newline at end of file diff --git a/docstore/992bc50e-b3be-40e8-9e44-556dcc598fe7 b/docstore/992bc50e-b3be-40e8-9e44-556dcc598fe7 new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/992bc50e-b3be-40e8-9e44-556dcc598fe7 @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . 
close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . \ No newline at end of file diff --git a/docstore/99537c0d-abbb-4094-9d0a-6bdd64284229 b/docstore/99537c0d-abbb-4094-9d0a-6bdd64284229 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/99537c0d-abbb-4094-9d0a-6bdd64284229 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. 
Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/995731a5-0cce-4532-96c4-4013b977b3fd b/docstore/995731a5-0cce-4532-96c4-4013b977b3fd new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/995731a5-0cce-4532-96c4-4013b977b3fd @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/9957a200-71bf-4cf8-a56d-16fb59299ac3 b/docstore/9957a200-71bf-4cf8-a56d-16fb59299ac3 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/9957a200-71bf-4cf8-a56d-16fb59299ac3 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/99690526-1b09-42ee-8aa4-cf84495ff8d8 b/docstore/99690526-1b09-42ee-8aa4-cf84495ff8d8 new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/99690526-1b09-42ee-8aa4-cf84495ff8d8 @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/996f8b7d-ee85-4622-b4cf-6cc0b4ee0194 b/docstore/996f8b7d-ee85-4622-b4cf-6cc0b4ee0194 new file mode 100644 index 0000000000000000000000000000000000000000..d11771732a04c7d26188d06a054298ac8420471a --- /dev/null +++ b/docstore/996f8b7d-ee85-4622-b4cf-6cc0b4ee0194 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#text-embedding Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/997e259c-0138-4546-aaf8-f91d88e88012 b/docstore/997e259c-0138-4546-aaf8-f91d88e88012 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/997e259c-0138-4546-aaf8-f91d88e88012 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/9984dcbe-8171-4cea-950f-be5849f1c092 b/docstore/9984dcbe-8171-4cea-950f-be5849f1c092 new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/9984dcbe-8171-4cea-950f-be5849f1c092 @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. 
The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/9999fecf-71ac-404b-a267-024b154f4b7f b/docstore/9999fecf-71ac-404b-a267-024b154f4b7f new file mode 100644 index 0000000000000000000000000000000000000000..46dc106c387700742db50f2912cf28b003e737e9 --- /dev/null +++ b/docstore/9999fecf-71ac-404b-a267-024b154f4b7f @@ -0,0 +1 @@ +ClientConfig { APIKey : " YOUR_API_KEY " , Backend : genai . BackendGeminiAPI , }) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = Client . builder (). apiKey ( " YOUR_API_KEY " ). build (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $ YOUR_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Keep your API key secure Treat your Gemini API key like a password. If compromised, others can use your project's quota, incur charges (if billing is enabled), and access your private data, such as files. Critical security rules Never commit API keys to source control. Do not check your API key into version control systems like Git. Never expose API keys on the client-side. Do not use your API key directly in web or mobile apps in production. Keys in client-side code (including our JavaScript/TypeScript libraries and REST calls) can be extracted. Best practices Use server-side calls with API keys The most secure way to use your API key is to call the Gemini API from a server-side application where the key can be kept confidential. Use ephemeral tokens for client-side access (Live API only): For direct client-side access to the Live API, you can use ephemeral tokens. They come with lower security risks and can be \ No newline at end of file diff --git a/docstore/99bfc611-6e9c-4dec-aa4a-957d0c231949 b/docstore/99bfc611-6e9c-4dec-aa4a-957d0c231949 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/99bfc611-6e9c-4dec-aa4a-957d0c231949 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/99c80c04-89cf-4ae8-88ae-ec2397e556ef b/docstore/99c80c04-89cf-4ae8-88ae-ec2397e556ef new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/99c80c04-89cf-4ae8-88ae-ec2397e556ef @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/99e2092f-efa0-495e-a22d-84e7443d0b32 b/docstore/99e2092f-efa0-495e-a22d-84e7443d0b32 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/99e2092f-efa0-495e-a22d-84e7443d0b32 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' 
}, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); \ No newline at end of file diff --git a/docstore/9a219334-3888-4314-ad2a-126ec117c032 b/docstore/9a219334-3888-4314-ad2a-126ec117c032 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/9a219334-3888-4314-ad2a-126ec117c032 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. 
For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/9a2e8bec-551c-4415-9ed8-8caabd617735 b/docstore/9a2e8bec-551c-4415-9ed8-8caabd617735 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/9a2e8bec-551c-4415-9ed8-8caabd617735 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/9a3582be-e1da-4ba6-85f3-c68bad074257 b/docstore/9a3582be-e1da-4ba6-85f3-c68bad074257 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/9a3582be-e1da-4ba6-85f3-c68bad074257 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. 
Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/9a7c3789-511c-4a56-ad8d-144be2132ab7 b/docstore/9a7c3789-511c-4a56-ad8d-144be2132ab7 new file mode 100644 index 0000000000000000000000000000000000000000..e01fb45cbed3453bf1ca90f079ac96b78e0dc7ff --- /dev/null +++ b/docstore/9a7c3789-511c-4a56-ad8d-144be2132ab7 @@ -0,0 +1 @@ +Asynchronous function calling is only supported in half-cascade audio generation. Function calling executes sequentially by default, meaning execution pauses until the results of each function call are available. This ensures sequential processing, which means you won't be able to continue interacting with the model while the functions are being run. If you don't want to block the conversation, you can tell the model to run the functions asynchronously. To do so, you first need to add a behavior to the function definitions: Python # Non-blocking function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" , "behavior" : "NON_BLOCKING" } # turn_on_the_lights will run asynchronously turn_off_the_lights = { "name" : "turn_off_the_lights" } # turn_off_the_lights will still pause all interactions with the model JavaScript import { GoogleGenAI , Modality , Behavior } from '@google/genai' ; // Non-blocking function definitions const turn_on_the_lights = { name : "turn_on_the_lights" , behavior : Behavior . NON_BLOCKING } // Blocking function definitions const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] NON-BLOCKING ensures the function runs asynchronously while you can continue interacting with the model. Then you need to tell the model how to behave when it receives the FunctionResponse using the scheduling parameter. It can either: Interrupt what it's doing and tell you about the response it got right away ( scheduling="INTERRUPT" ), Wait until it's finished with what it's currently doing ( scheduling="WHEN_IDLE" ), Or do nothing and use that knowledge later on in the discussion ( scheduling="SILENT" ) Python # for a non-blocking function definition, apply scheduling in the function response: function_response = types . FunctionResponse ( id = fc . id , name = fc . 
name , response = { "result" : "ok" , "scheduling" : "INTERRUPT" # Can also be WHEN_IDLE or \ No newline at end of file diff --git a/docstore/9aa00404-5f1b-4544-8411-a7632acb64a6 b/docstore/9aa00404-5f1b-4544-8411-a7632acb64a6 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/9aa00404-5f1b-4544-8411-a7632acb64a6 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/9aa7b951-326c-4f7c-9628-58ae4ecf8307 b/docstore/9aa7b951-326c-4f7c-9628-58ae4ecf8307 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/9aa7b951-326c-4f7c-9628-58ae4ecf8307 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/9ab7dd58-c67d-4bec-ab9b-ba69ef2bdb18 b/docstore/9ab7dd58-c67d-4bec-ab9b-ba69ef2bdb18 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/9ab7dd58-c67d-4bec-ab9b-ba69ef2bdb18 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. 
You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/9ad5c753-f17f-4e5b-b790-6463b7e9057a b/docstore/9ad5c753-f17f-4e5b-b790-6463b7e9057a new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/9ad5c753-f17f-4e5b-b790-6463b7e9057a @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/9af126ba-4703-4bcb-b8b5-cf0af83aa5df b/docstore/9af126ba-4703-4bcb-b8b5-cf0af83aa5df new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/9af126ba-4703-4bcb-b8b5-cf0af83aa5df @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/9afa2ebe-3d30-41be-b91c-653f8ccb795d b/docstore/9afa2ebe-3d30-41be-b91c-653f8ccb795d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/9afa2ebe-3d30-41be-b91c-653f8ccb795d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9b119c2d-91cb-42e4-a0c8-f4f2e4a3819b b/docstore/9b119c2d-91cb-42e4-a0c8-f4f2e4a3819b new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/9b119c2d-91cb-42e4-a0c8-f4f2e4a3819b @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' 
}, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/9b3975d0-fe35-4922-8fd2-a086e31ad9a2 b/docstore/9b3975d0-fe35-4922-8fd2-a086e31ad9a2 new file mode 100644 index 0000000000000000000000000000000000000000..160f3e6856b7e1adae02c42007e260c002e5430c --- /dev/null +++ b/docstore/9b3975d0-fe35-4922-8fd2-a086e31ad9a2 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode#inline-requests Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9b440408-087c-40fb-8d30-d913388cdf3c b/docstore/9b440408-087c-40fb-8d30-d913388cdf3c new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/9b440408-087c-40fb-8d30-d913388cdf3c @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. 
An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/9b51cb46-a02f-4036-b7bf-a07698304729 b/docstore/9b51cb46-a02f-4036-b7bf-a07698304729 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9b51cb46-a02f-4036-b7bf-a07698304729 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/9b68a741-50b4-4e83-bff7-5c6a821977a3 b/docstore/9b68a741-50b4-4e83-bff7-5c6a821977a3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/9b68a741-50b4-4e83-bff7-5c6a821977a3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
\ No newline at end of file diff --git a/docstore/9b82ed51-b091-4914-9c5f-24c25da538df b/docstore/9b82ed51-b091-4914-9c5f-24c25da538df new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/9b82ed51-b091-4914-9c5f-24c25da538df @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/9bce429c-f940-4372-87d2-baa40f342db2 b/docstore/9bce429c-f940-4372-87d2-baa40f342db2 new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/9bce429c-f940-4372-87d2-baa40f342db2 @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. 
Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/9bcf3a8f-6204-472f-8614-f6ba40cf9046 b/docstore/9bcf3a8f-6204-472f-8614-f6ba40cf9046 new file mode 100644 index 0000000000000000000000000000000000000000..f2b17a9e75cd68be6927ebbebeaa9599b23b6033 --- /dev/null +++ b/docstore/9bcf3a8f-6204-472f-8614-f6ba40cf9046 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate-to-cloud Title: Gemini Developer API v.s. Vertex AI | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9bd86941-242b-46c6-aa2e-27454fb294bd b/docstore/9bd86941-242b-46c6-aa2e-27454fb294bd new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/9bd86941-242b-46c6-aa2e-27454fb294bd @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/9c050cc9-3568-4359-932a-2ebce126e140 b/docstore/9c050cc9-3568-4359-932a-2ebce126e140 new file mode 100644 index 0000000000000000000000000000000000000000..695625c4874909c2b3cfd8b5c8824970cf01837a --- /dev/null +++ b/docstore/9c050cc9-3568-4359-932a-2ebce126e140 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#multi-tool-use Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9c06775e-4cdc-4453-bd36-56c0b45ac68e b/docstore/9c06775e-4cdc-4453-bd36-56c0b45ac68e new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/9c06775e-4cdc-4453-bd36-56c0b45ac68e @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/9c0c4d08-3fc1-4d14-8d5d-3dce5e55250c b/docstore/9c0c4d08-3fc1-4d14-8d5d-3dce5e55250c new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/9c0c4d08-3fc1-4d14-8d5d-3dce5e55250c @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/9c1eabdf-a0b1-4bcb-83bf-1d98cab6dd35 b/docstore/9c1eabdf-a0b1-4bcb-83bf-1d98cab6dd35 new file mode 100644 index 0000000000000000000000000000000000000000..de98b41ec31106077167d65dc0d83dfd4822d872 --- /dev/null +++ b/docstore/9c1eabdf-a0b1-4bcb-83bf-1d98cab6dd35 @@ -0,0 +1 @@ +moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. 
Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the \ No newline at end of file diff --git a/docstore/9c28bc6e-3a3c-44a3-be8d-03565e10a71e b/docstore/9c28bc6e-3a3c-44a3-be8d-03565e10a71e new file mode 100644 index 0000000000000000000000000000000000000000..ee5c813c791d5a83a4aaebfc95bbc6cde222bca1 --- /dev/null +++ b/docstore/9c28bc6e-3a3c-44a3-be8d-03565e10a71e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation#imagen Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9c82a56b-8a2e-41b5-b414-efe3d07d64cb b/docstore/9c82a56b-8a2e-41b5-b414-efe3d07d64cb new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/9c82a56b-8a2e-41b5-b414-efe3d07d64cb @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. 
The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/9c83885f-b8a6-4a8a-a2aa-27fddcba42dd b/docstore/9c83885f-b8a6-4a8a-a2aa-27fddcba42dd new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/9c83885f-b8a6-4a8a-a2aa-27fddcba42dd @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9c9cf291-49d2-4798-8c5a-50c147d7d0e5 b/docstore/9c9cf291-49d2-4798-8c5a-50c147d7d0e5 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/9c9cf291-49d2-4798-8c5a-50c147d7d0e5 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/9cb76ca2-8f10-4717-bcf9-6eb3b16bad99 b/docstore/9cb76ca2-8f10-4717-bcf9-6eb3b16bad99 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/9cb76ca2-8f10-4717-bcf9-6eb3b16bad99 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/9cd63b84-53c6-4d11-af50-7784a9b96818 b/docstore/9cd63b84-53c6-4d11-af50-7784a9b96818 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/9cd63b84-53c6-4d11-af50-7784a9b96818 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
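Because the 2.5 models described above think by default, a short sketch of adjusting that behavior may be useful. It reuses the thinking_budget field that appears in the ThinkingConfig example later in this document; setting the budget to 0 to disable thinking is an illustrative choice for latency-sensitive calls, not a recommendation from the page.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the difference between a list and a tuple in Python.",
    config=types.GenerateContentConfig(
        # 0 turns thinking off; omit the config entirely to let the model
        # decide how much to think for the request.
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)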
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9d00027f-04af-4970-8519-decfac1e5551 b/docstore/9d00027f-04af-4970-8519-decfac1e5551 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/9d00027f-04af-4970-8519-decfac1e5551 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/9d01daf9-8e68-4a8a-b0e3-519332864a16 b/docstore/9d01daf9-8e68-4a8a-b0e3-519332864a16 new file mode 100644 index 0000000000000000000000000000000000000000..53e5ed0c4b3c9d5f8d129df24753928921198efa --- /dev/null +++ b/docstore/9d01daf9-8e68-4a8a-b0e3-519332864a16 @@ -0,0 +1 @@ +text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ { "parts": [ {"text": "Who won the euro 2024?"} ] } ], "tools": [ { "google_search": {} } ] }' You can learn more by trying the Search tool notebook . How grounding with Google Search works When you enable the google_search tool, the model handles the entire workflow of searching, processing, and citing information automatically. User Prompt: Your application sends a user's prompt to the Gemini API with the google_search tool enabled. Prompt Analysis: The model analyzes the prompt and determines if a Google Search can improve the answer. Google Search: If needed, the model automatically generates one or multiple search queries and executes them. Search Results Processing: The model processes the search results, synthesizes the information, and formulates a response. Grounded Response: The API returns a final, user-friendly response that is grounded in the search results. This response includes the model's text answer and groundingMetadata with the search queries, web results, and citations. Understanding the Grounding Response When a response is successfully grounded, the response includes a groundingMetadata field. This structured data is essential for verifying claims and building a rich citation experience in your application. { "candidates" : [ { "content" : { "parts" : [ { "text" : "Spain won Euro 2024, defeating England 2-1 in the final. This victory marks Spain's record fourth European Championship title." } ], "role" : "model" }, "groundingMetadata" : { "webSearchQueries" : [ "UEFA Euro 2024 winner" , "who won euro 2024" ], "searchEntryPoint" : { "renderedContent" : "" }, "groundingChunks" : [ { "web" : { "uri" : "https://vertexaisearch.cloud.google.com....." , "title" : "aljazeera.com" }}, { "web" : \ No newline at end of file diff --git a/docstore/9d01e2c6-1c0d-4800-9503-316797a03463 b/docstore/9d01e2c6-1c0d-4800-9503-316797a03463 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/9d01e2c6-1c0d-4800-9503-316797a03463 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. 
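Before the list of supported annotation types in the next excerpt, here is a minimal sketch of a function whose parameters the SDK can describe automatically. It reuses the get_current_temperature name from the excerpt above; the stub body and the prompt are invented for illustration.

from google import genai
from google.genai import types

client = genai.Client()

def get_current_temperature(location: str, unit: str = "celsius") -> dict:
    """Gets the current temperature for a given location."""
    # Illustrative stub; a real implementation would call a weather service.
    return {"location": location, "temperature": 21, "unit": unit}

config = types.GenerateContentConfig(tools=[get_current_temperature])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in Paris right now?",
    config=config,
)
# With automatic function calling left enabled, the SDK runs the function and
# returns the model's final text answer.
print(response.text)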
The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/9d0461a6-899e-4dca-b33b-9e0805fb2efb b/docstore/9d0461a6-899e-4dca-b33b-9e0805fb2efb new file mode 100644 index 0000000000000000000000000000000000000000..980cad742ce4bdad224c6b76fd35613451194dd7 --- /dev/null +++ b/docstore/9d0461a6-899e-4dca-b33b-9e0805fb2efb @@ -0,0 +1 @@ +parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . 
name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : \ No newline at end of file diff --git a/docstore/9d0c4d9c-57f3-4783-a478-d8be98bc44d5 b/docstore/9d0c4d9c-57f3-4783-a478-d8be98bc44d5 new file mode 100644 index 0000000000000000000000000000000000000000..9bbc716abe48d525c0fa9d948b1c2978f423996b --- /dev/null +++ b/docstore/9d0c4d9c-57f3-4783-a478-d8be98bc44d5 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#prompting-with-videos Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9d325b8d-90c6-43a0-b030-71f9417234e2 b/docstore/9d325b8d-90c6-43a0-b030-71f9417234e2 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/9d325b8d-90c6-43a0-b030-71f9417234e2 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
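Since the Gemini 2.5 Flash Live entry described above is used through the Live API, a rough text-only session sketch may help orient readers. The model code is taken from the excerpt below; the connection pattern follows the Live API getting-started examples and should be treated as an approximation rather than this page's own sample.

import asyncio
from google import genai

client = genai.Client()
MODEL = "gemini-live-2.5-flash-preview"  # model code listed in the excerpt below

async def main():
    # Open a bidirectional session; restrict output to text for simplicity.
    async with client.aio.live.connect(
        model=MODEL, config={"response_modalities": ["TEXT"]}
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello, are you there?"}]},
            turn_complete=True,
        )
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

asyncio.run(main())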
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/9d3ff458-a4fa-4393-b466-b6a7b8a4845b b/docstore/9d3ff458-a4fa-4393-b466-b6a7b8a4845b new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/9d3ff458-a4fa-4393-b466-b6a7b8a4845b @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/9d48a5b0-2fbe-4ae1-a9d7-bfbc38ec27e1 b/docstore/9d48a5b0-2fbe-4ae1-a9d7-bfbc38ec27e1 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/9d48a5b0-2fbe-4ae1-a9d7-bfbc38ec27e1 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/9d58cbea-6b13-428b-9154-1e261b549767 b/docstore/9d58cbea-6b13-428b-9154-1e261b549767 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/9d58cbea-6b13-428b-9154-1e261b549767 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. 
Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/9d954858-7708-4479-83b2-44c2aab8000b b/docstore/9d954858-7708-4479-83b2-44c2aab8000b new file mode 100644 index 0000000000000000000000000000000000000000..b23a8acc5f0d54a573ae6bf2c9ff53a2c6e1da77 --- /dev/null +++ b/docstore/9d954858-7708-4479-83b2-44c2aab8000b @@ -0,0 +1 @@ +Rate limits | Gemini API | Google AI for Developers Rate limits Rate limits regulate the number of requests you can make to the Gemini API within a given timeframe. These limits help maintain fair usage, protect against abuse, and help maintain system performance for all users. How rate limits work Rate limits are usually measured across three dimensions: Requests per minute ( RPM ) Requests per day ( RPD ) Tokens per minute (input) ( TPM ) Your usage is evaluated against each limit, and exceeding any of them will trigger a rate limit error. For example, if your RPM limit is 20, making 21 requests within a minute will result in an error, even if you haven't exceeded your TPM or other limits. Rate limits are applied per project, not per API key. Limits vary depending on the specific model being used, and some limits only apply to specific models. For example, Images per minute, or IPM, is only calculated for models capable of generating images (Imagen 3), but is conceptually similar to TPM. Other models might have a token per day limit (TPD). Rate limits are more restricted for experimental and preview models. Usage tiers Rate limits are tied to the project's usage tier. As your API usage and spending increase, you'll have an option to upgrade to a higher tier with increased rate limits. Tier Qualifications Free Users in eligible countries Tier 1 Billing account linked to the project Tier 2 Total spend: > $250 and at least 30 days since successful payment Tier 3 Total spend: > $1,000 and at least 30 days since successful payment When you request an upgrade, our automated abuse protection system performs additional checks. \ No newline at end of file diff --git a/docstore/9da514ab-dd52-4a60-b4ab-dbe7244e901d b/docstore/9da514ab-dd52-4a60-b4ab-dbe7244e901d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/9da514ab-dd52-4a60-b4ab-dbe7244e901d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page .
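The rate-limit material above notes that exceeding RPM, RPD, or TPM triggers a rate limit error. A minimal, hedged retry sketch follows: the backoff policy and the string check for a 429 status are illustrative choices rather than guidance from this page, and a production client would inspect the SDK's typed error classes instead.

import random
import time

def generate_with_backoff(client, **kwargs):
    """Retries a generate_content call after rate-limit (HTTP 429) failures."""
    for attempt in range(5):
        try:
            return client.models.generate_content(**kwargs)
        except Exception as exc:
            # Crude check: rate-limit failures surface with a 429 status code.
            if "429" not in str(exc) or attempt == 4:
                raise
            # Exponential backoff with jitter before the next attempt.
            time.sleep(2 ** attempt + random.random())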
Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9da87571-0c9e-489d-8c39-95a11d56695e b/docstore/9da87571-0c9e-489d-8c39-95a11d56695e new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/9da87571-0c9e-489d-8c39-95a11d56695e @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/9db3c5f6-6c71-4871-88a7-1c12118c3fd5 b/docstore/9db3c5f6-6c71-4871-88a7-1c12118c3fd5 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/9db3c5f6-6c71-4871-88a7-1c12118c3fd5 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/9db932bf-f5c2-41dd-9b66-aea6eebfc964 b/docstore/9db932bf-f5c2-41dd-9b66-aea6eebfc964 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/9db932bf-f5c2-41dd-9b66-aea6eebfc964 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! 
Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/9dbdf3ba-52a8-4f4d-99b2-52d64e345ed3 b/docstore/9dbdf3ba-52a8-4f4d-99b2-52d64e345ed3 new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/9dbdf3ba-52a8-4f4d-99b2-52d64e345ed3 @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. 
Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/9dc8797d-3ecb-4a48-ad1f-b11700ef74fe b/docstore/9dc8797d-3ecb-4a48-ad1f-b11700ef74fe new file mode 100644 index 0000000000000000000000000000000000000000..f518fb33367f92e6f7dff60abbf716e23f7823fa --- /dev/null +++ b/docstore/9dc8797d-3ecb-4a48-ad1f-b11700ef74fe @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/pricing Title: Gemini Developer API Pricing | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9dddb5e4-ecae-4f55-a8fe-69ff15f40037 b/docstore/9dddb5e4-ecae-4f55-a8fe-69ff15f40037 new file mode 100644 index 0000000000000000000000000000000000000000..5b10a49a34afcc5006e0bf4f1bcb0c14355ae334 --- /dev/null +++ b/docstore/9dddb5e4-ecae-4f55-a8fe-69ff15f40037 @@ -0,0 +1 @@ +environment includes the following libraries: attrs chess contourpy fpdf geopandas imageio jinja2 joblib jsonschema jsonschema-specifications lxml matplotlib mpmath numpy opencv-python openpyxl packaging pandas pillow protobuf pylatex pyparsing PyPDF2 python-dateutil python-docx python-pptx reportlab scikit-learn scipy seaborn six striprtf sympy tabulate tensorflow toolz xlrd You can't install your own libraries. Note: Only matplotlib is supported for graph rendering using code execution. What's next Try the code execution Colab . Learn about other Gemini API tools: Function calling Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/9e167737-36d9-4b50-ba33-6e535a0e1544 b/docstore/9e167737-36d9-4b50-ba33-6e535a0e1544 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/9e167737-36d9-4b50-ba33-6e535a0e1544 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. 
Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/9e1911ad-8959-4b4d-ae70-78e257c9e4e3 b/docstore/9e1911ad-8959-4b4d-ae70-78e257c9e4e3 new file mode 100644 index 0000000000000000000000000000000000000000..4b6418baecebd23eec6598a4eb723dc1516263bd --- /dev/null +++ b/docstore/9e1911ad-8959-4b4d-ae70-78e257c9e4e3 @@ -0,0 +1 @@ +default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. 
Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. \ No newline at end of file diff --git a/docstore/9e24d1f0-c3a3-429e-967b-ee9ff2b6b23a b/docstore/9e24d1f0-c3a3-429e-967b-ee9ff2b6b23a new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/9e24d1f0-c3a3-429e-967b-ee9ff2b6b23a @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. 
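A minimal sketch of that add-context pattern, assembling the router's troubleshooting guide into the prompt before asking the question; the guide text is a placeholder, and the revised prompt itself continues below.

from google import genai

client = genai.Client()

troubleshooting_guide = """(paste or load the Google Wifi troubleshooting guide here)"""

prompt = f"""Answer the question using the text below. Respond with only the steps that apply.

Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly.

Text:
{troubleshooting_guide}
"""

response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
print(response.text)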
Prompt: Answer the question \ No newline at end of file diff --git a/docstore/9e29ad4f-5a7d-4886-9dd9-609d1ddbdaca b/docstore/9e29ad4f-5a7d-4886-9dd9-609d1ddbdaca new file mode 100644 index 0000000000000000000000000000000000000000..7e27e2809a28a31c9d496a7e3606c1969c7dca44 --- /dev/null +++ b/docstore/9e29ad4f-5a7d-4886-9dd9-609d1ddbdaca @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation?lang=python#system-instructions Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/9e787a39-3ab7-4cc6-96dd-408b2dd03e0d b/docstore/9e787a39-3ab7-4cc6-96dd-408b2dd03e0d new file mode 100644 index 0000000000000000000000000000000000000000..be865665baa597a5b341e658abc6f47e616f09e1 --- /dev/null +++ b/docstore/9e787a39-3ab7-4cc6-96dd-408b2dd03e0d @@ -0,0 +1 @@ +OpenAI compatibility | Gemini API | Google AI for Developers OpenAI compatibility Gemini models are accessible using the OpenAI libraries (Python and TypeScript / Javascript) along with the REST API, by updating three lines of code and using your Gemini API key . If you aren't already using the OpenAI libraries, we recommend that you call the Gemini API directly . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Explain to me how AI works" } ] ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "You are a helpful assistant." }, { role : "user" , content : "Explain to me how AI works" , }, ], }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' What changed? Just three lines! api_key="GEMINI_API_KEY" : Replace " GEMINI_API_KEY " with your actual Gemini API key, \ No newline at end of file diff --git a/docstore/9e797ab4-c4ce-4f0d-88de-ae181c824eb7 b/docstore/9e797ab4-c4ce-4f0d-88de-ae181c824eb7 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/9e797ab4-c4ce-4f0d-88de-ae181c824eb7 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency.
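Building on the OpenAI-compatibility material above, a short sketch of streaming through the same endpoint; the base URL and placeholder API key come from that excerpt, while the streaming loop is a standard OpenAI-library pattern rather than this page's own sample.

from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",  # replace with your actual Gemini API key
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

stream = client.chat.completions.create(
    model="gemini-2.5-flash",
    messages=[{"role": "user", "content": "Explain to me how AI works"}],
    stream=True,
)
for chunk in stream:
    # Each streamed chunk carries an incremental delta of the reply.
    print(chunk.choices[0].delta.content or "", end="")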
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/9e7ddd2f-1f5f-424a-ae12-8eef6bc4fd53 b/docstore/9e7ddd2f-1f5f-424a-ae12-8eef6bc4fd53 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/9e7ddd2f-1f5f-424a-ae12-8eef6bc4fd53 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/9e832075-1ddb-4b29-bfb5-00ead2c340e7 b/docstore/9e832075-1ddb-4b29-bfb5-00ead2c340e7 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/9e832075-1ddb-4b29-bfb5-00ead2c340e7 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases.
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/9e8a6920-2363-44b0-bcdd-f7641555e21d b/docstore/9e8a6920-2363-44b0-bcdd-f7641555e21d new file mode 100644 index 0000000000000000000000000000000000000000..01ae62c8740ecd40460af64187bff6feef8cdae5 --- /dev/null +++ b/docstore/9e8a6920-2363-44b0-bcdd-f7641555e21d @@ -0,0 +1 @@ +are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . 
id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : \ No newline at end of file diff --git a/docstore/9eaa6eb0-a61e-488b-9568-95b28c637860 b/docstore/9eaa6eb0-a61e-488b-9568-95b28c637860 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/9eaa6eb0-a61e-488b-9568-95b28c637860 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/9eb7e39e-f863-4b26-935d-2f831f1d1584 b/docstore/9eb7e39e-f863-4b26-935d-2f831f1d1584 new file mode 100644 index 0000000000000000000000000000000000000000..2e2abef829a364ac1b0e2e7d97629b4c7b341cf0 --- /dev/null +++ b/docstore/9eb7e39e-f863-4b26-935d-2f831f1d1584 @@ -0,0 +1 @@ +the TTL defaults to 1 hour. The cost for caching depends on the input token size and how long you want the tokens to persist. This section assumes that you've installed a Gemini SDK (or have curl installed) and that you've configured an API key, as shown in the quickstart . Explicit caching using the OpenAI library If you're using an OpenAI library , you can enable explicit caching using the cached_content property on extra_body . When to use explicit caching Context caching is particularly well suited to scenarios where a substantial initial context is referenced repeatedly by shorter requests. 
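A hedged sketch of the explicit-caching workflow described above, using the google-genai SDK rather than the OpenAI compatibility layer; the model name, system instruction, file path, and prompts are illustrative, and the list of suitable use cases continues below.

from google import genai
from google.genai import types

client = genai.Client()

# Upload and cache the large, reused context once. Some caching setups expect
# an explicit model version string; the alias below is an assumption.
document = client.files.upload(file="report.pdf")  # illustrative local file
cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        system_instruction="You are an expert analyst of this report.",
        contents=[document],
        ttl="3600s",  # optional; the text notes the TTL defaults to 1 hour
    ),
)

# Later, shorter requests reference the cache instead of resending the context.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List the three most important risks mentioned in the report.",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)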
Consider using context caching for use cases such as: Chatbots with extensive system instructions Repetitive analysis of lengthy video files Recurring queries against large document sets Frequent code repository analysis or bug fixing How explicit caching reduces costs Context caching is a paid feature designed to reduce overall operational costs. Billing is based on the following factors: Cache token count: The number of input tokens cached, billed at a reduced rate when included in subsequent prompts. Storage duration: The amount of time cached tokens are stored (TTL), billed based on the TTL duration of cached token count. There are no minimum or maximum bounds on the TTL. Other factors: Other charges apply, such as for non-cached input tokens and output tokens. For up-to-date pricing details, refer to the Gemini API pricing page . To learn how to count tokens, see the Token guide . Additional considerations Keep the following considerations in mind when using context caching: The minimum input token count for context caching is 1,024 for 2.5 Flash and 2,048 for 2.5 Pro. The maximum is the same as the maximum for the given model. (For more on counting tokens, see the Token guide ). The model doesn't make any distinction between cached tokens and regular input tokens. Cached content is a prefix to the prompt. There are no special rate or usage limits on context caching; \ No newline at end of file diff --git a/docstore/9ebdf07d-dd6d-47df-ae85-b3d84109e022 b/docstore/9ebdf07d-dd6d-47df-ae85-b3d84109e022 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/9ebdf07d-dd6d-47df-ae85-b3d84109e022 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? 
Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/9ec7d2a2-0383-4e46-b6e6-7eda6d276d96 b/docstore/9ec7d2a2-0383-4e46-b6e6-7eda6d276d96 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9ec7d2a2-0383-4e46-b6e6-7eda6d276d96 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/9ef94cc4-ea21-43c7-a421-167dfa81aaff b/docstore/9ef94cc4-ea21-43c7-a421-167dfa81aaff new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9ef94cc4-ea21-43c7-a421-167dfa81aaff @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/9f08a60b-ed8b-4268-a9e1-baac8e50e4ec b/docstore/9f08a60b-ed8b-4268-a9e1-baac8e50e4ec new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9f08a60b-ed8b-4268-a9e1-baac8e50e4ec @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
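As a companion to the explicit-caching notes above, here is a rough sketch of explicit caching with the google-genai Python SDK rather than the OpenAI compatibility layer. It assumes a client.caches.create call and a types.CreateCachedContentConfig with a ttl field; verify the exact names against the current SDK reference before relying on it.

Python
from google import genai
from google.genai import types

client = genai.Client()

# The cached prefix must meet the minimum token count
# (1,024 tokens for 2.5 Flash, 2,048 for 2.5 Pro).
long_document = open("manual.txt").read()  # hypothetical large context

cache = client.caches.create(
    model="gemini-2.5-flash",
    config=types.CreateCachedContentConfig(
        system_instruction="Answer questions using only the attached manual.",
        contents=[long_document],
        ttl="3600s",  # storage duration; billed by cached token count over the TTL
    ),
)

# Subsequent short requests reference the cache instead of resending the prefix.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How do I reset the device to factory settings?",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)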
\ No newline at end of file diff --git a/docstore/9f11bd73-2daf-41ad-87fe-7fb056e0e289 b/docstore/9f11bd73-2daf-41ad-87fe-7fb056e0e289 new file mode 100644 index 0000000000000000000000000000000000000000..a2eb6effe018fcbb28eb80c102d9fbb9beed9d0c --- /dev/null +++ b/docstore/9f11bd73-2daf-41ad-87fe-7fb056e0e289 @@ -0,0 +1 @@ +50,000 lines of code (with the standard 80 characters per line) All the text messages you have sent in the last 5 years 8 average length English novels Transcripts of over 200 average length podcast episodes The more limited context windows common in many other models often require strategies like arbitrarily dropping old messages, summarizing content, using RAG with vector databases, or filtering prompts to save tokens. While these techniques remain valuable in specific scenarios, Gemini's extensive context window invites a more direct approach: providing all relevant information upfront. Because Gemini models were purpose-built with massive context capabilities, they demonstrate powerful in-context learning. For example, using only in-context instructional materials (a 500-page reference grammar, a dictionary, and ≈400 parallel sentences), Gemini learned to translate from English to Kalamang—a Papuan language with fewer than 200 speakers—with quality similar to a human learner using the same materials. This illustrates the paradigm shift enabled by Gemini's long context, empowering new possibilities through robust in-context learning. Long context use cases While the standard use case for most generative models is still text input, the Gemini model family enables a new paradigm of multimodal use cases. These models can natively understand text, video, audio, and images. They are accompanied by the Gemini API that takes in multimodal file types for convenience. Long form text Text has proved to be the layer of intelligence underpinning much of the momentum around LLMs. As mentioned earlier, much of the practical limitation of LLMs was because of not having a large enough context window to do certain tasks. This led to the rapid adoption of retrieval augmented generation (RAG) and other techniques which dynamically provide the model with relevant contextual information. Now, with larger and larger context windows, there are new techniques becoming available which \ No newline at end of file diff --git a/docstore/9f1323e5-3cfd-41f9-bacf-e3b2fb725ba7 b/docstore/9f1323e5-3cfd-41f9-bacf-e3b2fb725ba7 new file mode 100644 index 0000000000000000000000000000000000000000..c839e4b299fa83f191461c51a3897f429d1b3fab --- /dev/null +++ b/docstore/9f1323e5-3cfd-41f9-bacf-e3b2fb725ba7 @@ -0,0 +1 @@ +Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." 
, }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the \ No newline at end of file diff --git a/docstore/9f1d18a7-d6d0-4e57-9b98-1eacb3f64541 b/docstore/9f1d18a7-d6d0-4e57-9b98-1eacb3f64541 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/9f1d18a7-d6d0-4e57-9b98-1eacb3f64541 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
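The token limits and capability details quoted in these model cards can also be read programmatically. The sketch below assumes the google-genai Python SDK's client.models.get call and that the returned model object exposes input_token_limit and output_token_limit fields (mirroring the REST models resource); treat the attribute names as illustrative.

Python
from google import genai

client = genai.Client()

for model_id in ("gemini-2.5-flash", "gemini-2.5-flash-lite-preview-06-17"):
    info = client.models.get(model=model_id)
    # Assumed field names, following the REST inputTokenLimit / outputTokenLimit.
    print(info.name, info.input_token_limit, info.output_token_limit)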
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/9f330151-2204-48a0-9abb-937cfbc8c074 b/docstore/9f330151-2204-48a0-9abb-937cfbc8c074 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/9f330151-2204-48a0-9abb-937cfbc8c074 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/9f509413-66bf-4906-ba48-d51a2c89831c b/docstore/9f509413-66bf-4906-ba48-d51a2c89831c new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/9f509413-66bf-4906-ba48-d51a2c89831c @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. 
Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/9f82ee6b-66e4-4013-937d-53fae81c26de b/docstore/9f82ee6b-66e4-4013-937d-53fae81c26de new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/9f82ee6b-66e4-4013-937d-53fae81c26de @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . 
OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/9f879c25-c898-4111-b6a4-49cc0af18692 b/docstore/9f879c25-c898-4111-b6a4-49cc0af18692 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/9f879c25-c898-4111-b6a4-49cc0af18692 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/9f8e70b3-17f2-43f7-b6fe-3e1d98717a1e b/docstore/9f8e70b3-17f2-43f7-b6fe-3e1d98717a1e new file mode 100644 index 0000000000000000000000000000000000000000..2437f77cb02a7dfc3b66d950f0fe4ad8777ea66f --- /dev/null +++ b/docstore/9f8e70b3-17f2-43f7-b6fe-3e1d98717a1e @@ -0,0 +1 @@ +SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . 
FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: \ No newline at end of file diff --git a/docstore/9f9a4989-3349-4df9-8c83-d05f8f85605c b/docstore/9f9a4989-3349-4df9-8c83-d05f8f85605c new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/9f9a4989-3349-4df9-8c83-d05f8f85605c @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
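The Python TTS snippet above saves audio through a wave_file helper that isn't included in this chunk. A minimal implementation using only the standard library, assuming the same 24 kHz, 16-bit, mono PCM defaults shown in the JavaScript version, could look like this:

Python
import wave

def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes returned by the TTS model to a .wav file."""
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)   # 2 bytes per sample = 16-bit audio
        wf.setframerate(rate)           # 24 kHz output
        wf.writeframes(pcm_data)

# Usage, following the snippet above:
# data = response.candidates[0].content.parts[0].inline_data.data
# wave_file("out.wav", data)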
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/9fa608e7-2c0a-4a7d-a107-656a284f081b b/docstore/9fa608e7-2c0a-4a7d-a107-656a284f081b new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9fa608e7-2c0a-4a7d-a107-656a284f081b @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/9fac821d-7d3b-4763-98e1-0b9d55b00f61 b/docstore/9fac821d-7d3b-4763-98e1-0b9d55b00f61 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/9fac821d-7d3b-4763-98e1-0b9d55b00f61 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. 
Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/9fc1b04a-f1ee-4c34-99fb-3e4bacc16c32 b/docstore/9fc1b04a-f1ee-4c34-99fb-3e4bacc16c32 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/9fc1b04a-f1ee-4c34-99fb-3e4bacc16c32 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/9fdf953e-1b16-41b9-a98a-6f10eace80ed b/docstore/9fdf953e-1b16-41b9-a98a-6f10eace80ed new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/9fdf953e-1b16-41b9-a98a-6f10eace80ed @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
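Seeding the response format, as the completion strategy above describes, works the same way through the API: end the prompt with the start of the structure you want and let the model continue the pattern. A minimal sketch with the google-genai Python SDK:

Python
from google import genai

client = genai.Client()

prompt = (
    "Create an outline for an essay about hummingbirds.\n"
    "I. Introduction\n"
    "   *"
)

response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
# The model continues from the partial outline, keeping the numbering and
# bullet style that the prompt started.
print(response.text)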
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/9fe46a28-11a5-45ce-81be-20451bb757a2 b/docstore/9fe46a28-11a5-45ce-81be-20451bb757a2 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/9fe46a28-11a5-45ce-81be-20451bb757a2 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/a001d99a-c7b6-4afb-a4a0-eda66470631a b/docstore/a001d99a-c7b6-4afb-a4a0-eda66470631a new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/a001d99a-c7b6-4afb-a4a0-eda66470631a @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
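To apply the negative-prompt guidance above through the API rather than inside the prompt text, the video generation call takes a separate field for it. The sketch below assumes the google-genai Python SDK's generate_videos long-running operation and a negative_prompt field on types.GenerateVideosConfig; check the current Veo reference for the exact names.

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=(
        "A stylized animation of a large, solitary oak tree with leaves "
        "blowing vigorously in a strong wind, warm autumn color palette."
    ),
    config=types.GenerateVideosConfig(
        # Describe what to discourage ("wall, frame"), not "no walls".
        negative_prompt="wall, frame",
    ),
)

# Video generation is a long-running operation; poll until it completes.
while not operation.done:
    time.sleep(10)
    operation = client.operations.get(operation)

for n, generated in enumerate(operation.response.generated_videos):
    client.files.download(file=generated.video)
    generated.video.save(f"oak_tree_{n}.mp4")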
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/a004910e-67c3-43c6-931d-166bccfdb82e b/docstore/a004910e-67c3-43c6-931d-166bccfdb82e new file mode 100644 index 0000000000000000000000000000000000000000..404efdddaf46864c9f225aa83233d4809952ce81 --- /dev/null +++ b/docstore/a004910e-67c3-43c6-931d-166bccfdb82e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-guidance#main-content Title: Safety guidance | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a006ca79-44bd-42dd-895f-b27684f1d2aa b/docstore/a006ca79-44bd-42dd-895f-b27684f1d2aa new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/a006ca79-44bd-42dd-895f-b27684f1d2aa @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . 
Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/a00b4a22-9df4-4877-a5ef-dab26775e7ca b/docstore/a00b4a22-9df4-4877-a5ef-dab26775e7ca new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/a00b4a22-9df4-4877-a5ef-dab26775e7ca @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
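In practice, the version name patterns above only change the model string passed to the API; nothing else in the request differs. A short sketch with the google-genai Python SDK, using version ids quoted in this section:

Python
from google import genai

client = genai.Client()
prompt = "Summarize the difference between stable and preview model versions."

# Latest stable for a generation and variation (may move forward over time).
latest_stable = client.models.generate_content(model="gemini-2.0-flash", contents=prompt)

# Pinned stable version: recommended for most production apps.
pinned = client.models.generate_content(model="gemini-2.0-flash-001", contents=prompt)

# Preview version: may have tighter rate limits and can change or be retired.
preview = client.models.generate_content(model="gemini-2.5-pro-preview-06-05", contents=prompt)

print(latest_stable.text, pinned.text, preview.text, sep="\n---\n")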
Try in \ No newline at end of file diff --git a/docstore/a0263f5f-b3fb-47cf-ab94-5ef133179c35 b/docstore/a0263f5f-b3fb-47cf-ab94-5ef133179c35 new file mode 100644 index 0000000000000000000000000000000000000000..0cca4069bc044eb5ec6c4ebd1c70dec7e7b06ace --- /dev/null +++ b/docstore/a0263f5f-b3fb-47cf-ab94-5ef133179c35 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#supported-models Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a036f86d-5ac7-4da2-8dad-800edeca0f54 b/docstore/a036f86d-5ac7-4da2-8dad-800edeca0f54 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/a036f86d-5ac7-4da2-8dad-800edeca0f54 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. 
A single function \ No newline at end of file diff --git a/docstore/a03b4bbf-3b70-4183-9605-05c5bddb1d25 b/docstore/a03b4bbf-3b70-4183-9605-05c5bddb1d25 new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/a03b4bbf-3b70-4183-9605-05c5bddb1d25 @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/a0435c42-0a98-45dd-a838-50cb61932a53 b/docstore/a0435c42-0a98-45dd-a838-50cb61932a53 new file mode 100644 index 0000000000000000000000000000000000000000..01ae62c8740ecd40460af64187bff6feef8cdae5 --- /dev/null +++ b/docstore/a0435c42-0a98-45dd-a838-50cb61932a53 @@ -0,0 +1 @@ +are not available in OpenAI models but can be enabled using the extra_body field. extra_body features safety_settings Corresponds to Gemini's SafetySetting . cached_content Corresponds to Gemini's GenerateContentRequest.cached_content . thinking_config Corresponds to Gemini's ThinkingConfig . cached_content Here's an example of using extra_body to set cached_content : Python from openai import OpenAI client = OpenAI ( api_key = MY_API_KEY , base_url = "https://generativelanguage.googleapis.com/v1beta/" ) stream = client . chat . completions . 
create ( model = "gemini-2.5-pro" , n = 1 , messages = [ { "role" : "user" , "content" : "Summarize the video" } ], stream = True , stream_options = { 'include_usage' : True }, extra_body = { 'extra_body' : { 'google' : { 'cached_content' : "cachedContents/0000aaaa1111bbbb2222cccc3333dddd4444eeee" } } } ) for chunk in stream : print ( chunk ) print ( chunk . usage . to_dict ()) List models Get a list of available Gemini models: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) models = client . models . list () for model in models : print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const list = await openai . models . list (); for await ( const model of list ) { console . log ( model ); } } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models \ -H "Authorization: Bearer GEMINI_API_KEY" Retrieve a model Retrieve a Gemini model: Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) model = client . models . retrieve ( "gemini-2.0-flash" ) print ( model . id ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : \ No newline at end of file diff --git a/docstore/a04571c9-420c-4e12-9434-44e2ce8702fe b/docstore/a04571c9-420c-4e12-9434-44e2ce8702fe new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/a04571c9-420c-4e12-9434-44e2ce8702fe @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/a0474918-1e59-4588-b276-935fce149cdc b/docstore/a0474918-1e59-4588-b276-935fce149cdc new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/a0474918-1e59-4588-b276-935fce149cdc @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/a050f43d-a082-4dc2-8245-735c0166f42c b/docstore/a050f43d-a082-4dc2-8245-735c0166f42c new file mode 100644 index 0000000000000000000000000000000000000000..0b3b6fc570d30658769d9588c5c037674689b94a --- /dev/null +++ b/docstore/a050f43d-a082-4dc2-8245-735c0166f42c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#set-budget Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a06702c9-90d5-45bd-832b-260badc03cd2 b/docstore/a06702c9-90d5-45bd-832b-260badc03cd2 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/a06702c9-90d5-45bd-832b-260badc03cd2 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
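The elastic output dimensions listed for the embedding model can be requested per call. This is a rough sketch assuming the google-genai Python SDK's embed_content method and an output_dimensionality field on types.EmbedContentConfig; verify the names against the embeddings guide.

Python
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
    config=types.EmbedContentConfig(output_dimensionality=768),  # supports 3072, 1536, or 768
)

vector = result.embeddings[0].values
print(len(vector))  # expected 768 with the config above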
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/a06d4b88-d88c-49e7-a0ad-092e0c7cb904 b/docstore/a06d4b88-d88c-49e7-a0ad-092e0c7cb904 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/a06d4b88-d88c-49e7-a0ad-092e0c7cb904 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/a0765a12-b3ef-4522-babc-5bf280072337 b/docstore/a0765a12-b3ef-4522-babc-5bf280072337 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/a0765a12-b3ef-4522-babc-5bf280072337 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. 
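A small Python sketch of requesting embeddings from the Gemini Embedding model described above, assuming the google-genai SDK; output_dimensionality selects one of the elastic sizes (3072, 1536, or 768) listed in the model details:

```python
from google import genai
from google.genai import types

client = genai.Client()

# Request a 768-dimensional embedding from the experimental Gemini embedding model.
result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
    config=types.EmbedContentConfig(output_dimensionality=768),
)

# Each entry in result.embeddings corresponds to one input; values is the vector itself.
print(len(result.embeddings[0].values))
```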
The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/a076973f-136d-4318-b9d6-691e0a2e516c b/docstore/a076973f-136d-4318-b9d6-691e0a2e516c new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/a076973f-136d-4318-b9d6-691e0a2e516c @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
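The compositional function-calling request above is truncated at this chunk boundary. Below is a self-contained sketch of how such a request might be issued and read back with automatic function calling; the dummy tool bodies and the prompt wording are illustrative stand-ins, not the original example:

```python
from google import genai
from google.genai import types

# Dummy tools standing in for the functions defined in the example above.
def get_weather_forecast(location: str) -> dict:
    """Gets the current weather temperature for a given location."""
    return {"temperature": 25, "unit": "celsius"}

def set_thermostat_temperature(temperature: int) -> dict:
    """Sets the thermostat to a desired temperature."""
    return {"status": "success"}

client = genai.Client()

# Passing the Python functions directly enables automatic function calling.
config = types.GenerateContentConfig(
    tools=[get_weather_forecast, set_thermostat_temperature]
)

# Illustrative prompt; the original request string is truncated above.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="If it's warmer than 20°C in London, set the thermostat to 20°C.",
    config=config,
)

# By the time the call returns, the SDK has already executed the chained tool calls,
# so only the model's final natural-language answer remains to print.
print(response.text)
```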
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/a07bcf32-135c-4547-aa4a-1a7ee22eef4f b/docstore/a07bcf32-135c-4547-aa4a-1a7ee22eef4f new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/a07bcf32-135c-4547-aa4a-1a7ee22eef4f @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/a0bdfa85-14b2-4350-8b3b-2465ec91cce7 b/docstore/a0bdfa85-14b2-4350-8b3b-2465ec91cce7 new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/a0bdfa85-14b2-4350-8b3b-2465ec91cce7 @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/a0c5dd1d-2a85-41da-b908-669455f583a0 b/docstore/a0c5dd1d-2a85-41da-b908-669455f583a0 new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/a0c5dd1d-2a85-41da-b908-669455f583a0 @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . 
text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/a0c90aa5-c67d-4ca9-9206-1219200df0d4 b/docstore/a0c90aa5-c67d-4ca9-9206-1219200df0d4 new file mode 100644 index 0000000000000000000000000000000000000000..c6d1a23ee99ea908e6095ac5d4be22719691ebac --- /dev/null +++ b/docstore/a0c90aa5-c67d-4ca9-9206-1219200df0d4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation#main-content Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a0e26c42-56e2-4b20-825e-8122c3191bba b/docstore/a0e26c42-56e2-4b20-825e-8122c3191bba new file mode 100644 index 0000000000000000000000000000000000000000..48ce7760ed3b3e078bbb96293e0e67132c5a10c7 --- /dev/null +++ b/docstore/a0e26c42-56e2-4b20-825e-8122c3191bba @@ -0,0 +1 @@ +Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. 
Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). "Jupiter's Great Red Spot Shrinks and Grows" is in the \ No newline at end of file diff --git a/docstore/a0f33d11-12c1-48f7-8500-92f04a47e3fe b/docstore/a0f33d11-12c1-48f7-8500-92f04a47e3fe new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/a0f33d11-12c1-48f7-8500-92f04a47e3fe @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
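A minimal Python sketch of the Files API upload flow described above for video input, assuming the google-genai SDK; the file name is a placeholder and large uploads may need to finish processing before they can be referenced:

```python
from google import genai

client = genai.Client()

# Upload a large or reusable video via the Files API, as described above.
# "sample.mp4" is a placeholder path.
video_file = client.files.upload(file="sample.mp4")

# Longer videos are processed asynchronously; in practice you may need to poll
# client.files.get(name=video_file.name) until the file state becomes ACTIVE.

# Reference the uploaded file alongside a text prompt.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[video_file, "Summarize this video in three sentences."],
)
print(response.text)
```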
\ No newline at end of file diff --git a/docstore/a106a2e7-70c8-4c30-a7b5-273da91be44d b/docstore/a106a2e7-70c8-4c30-a7b5-273da91be44d new file mode 100644 index 0000000000000000000000000000000000000000..b61275a4d23d7f4554cf56c64eddf33f478c090b --- /dev/null +++ b/docstore/a106a2e7-70c8-4c30-a7b5-273da91be44d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#rate-limits Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a11fd676-0290-4e59-bd3d-6c49b0d02a13 b/docstore/a11fd676-0290-4e59-bd3d-6c49b0d02a13 new file mode 100644 index 0000000000000000000000000000000000000000..2dce4b1915975420243f156ab22de6a07e8b5cc9 --- /dev/null +++ b/docstore/a11fd676-0290-4e59-bd3d-6c49b0d02a13 @@ -0,0 +1 @@ +The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/a122c998-2792-4690-b4fc-8fa76533fa57 b/docstore/a122c998-2792-4690-b4fc-8fa76533fa57 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/a122c998-2792-4690-b4fc-8fa76533fa57 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . 
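As a concrete example of the image capabilities listed above (captioning, visual question answering, classification), here is a short Python sketch that pairs a local image with a specific instruction; the file path is a placeholder, and the google-genai SDK is assumed to accept PIL images directly in the contents list:

```python
from google import genai
from PIL import Image

client = genai.Client()

# "photo.jpg" is a placeholder; any supported format (PNG, JPEG, WEBP, HEIC, HEIF) works.
image = Image.open("photo.jpg")

# Visual question answering: pair the image with a specific instruction,
# as recommended in the multimodal prompting guidance above.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, "List the objects visible in this image, one per line."],
)
print(response.text)
```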
Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/a161294e-471a-4561-a2bb-a970cabdb317 b/docstore/a161294e-471a-4561-a2bb-a970cabdb317 new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/a161294e-471a-4561-a2bb-a970cabdb317 @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/a17d4e3a-e9ac-42bc-9571-4de5eb5fb0d6 b/docstore/a17d4e3a-e9ac-42bc-9571-4de5eb5fb0d6 new file mode 100644 index 0000000000000000000000000000000000000000..dfeae8fcf584330ed11cdd48e07105d5f4f56b31 --- /dev/null +++ b/docstore/a17d4e3a-e9ac-42bc-9571-4de5eb5fb0d6 @@ -0,0 +1 @@ +retrieval_tool ] ) response = client . models . generate_content ( model = 'gemini-1.5-flash' , contents = "Who won the euro 2024?" , config = config , ) print ( response . text ) if not response . candidates [ 0 ] . grounding_metadata : print ( " \n Model answered from its own knowledge." ) JavaScript // Note: This is a legacy approach for Gemini 1.5 models. 
// The 'googleSearch' tool is recommended for all new development. import { GoogleGenAI , DynamicRetrievalConfigMode } from "@google/genai" ; const ai = new GoogleGenAI ({}); const retrievalTool = { googleSearchRetrieval : { dynamicRetrievalConfig : { mode : DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamicThreshold : 0.7 , // Only search if confidence > 70% }, }, }; const config = { tools : [ retrievalTool ], }; const response = await ai . models . generateContent ({ model : "gemini-1.5-flash" , contents : "Who won the euro 2024?" , config , }); console . log ( response . text ); if ( ! response . candidates ? .[ 0 ] ? . groundingMetadata ) { console . log ( "\nModel answered from its own knowledge." ); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ {"parts": [{"text": "Who won the euro 2024?"}]} ], "tools": [{ "google_search_retrieval": { "dynamic_retrieval_config": { "mode": "MODE_DYNAMIC", "dynamic_threshold": 0.7 } } }] }' What's next Try the Grounding with Google Search in the Gemini API Cookbook . Learn about other available tools, like Function Calling . Learn how to augment prompts with specific URLs using the URL context tool . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. \ No newline at end of file diff --git a/docstore/a1e72b96-5cea-4b4c-92bd-2ca6a1d0c417 b/docstore/a1e72b96-5cea-4b4c-92bd-2ca6a1d0c417 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/a1e72b96-5cea-4b4c-92bd-2ca6a1d0c417 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . 
split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/a1e920dd-87dc-407f-88ce-3e03201b868e b/docstore/a1e920dd-87dc-407f-88ce-3e03201b868e new file mode 100644 index 0000000000000000000000000000000000000000..495316578216786d8cc06cb68d3942d0694abd1d --- /dev/null +++ b/docstore/a1e920dd-87dc-407f-88ce-3e03201b868e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#live-api-2.0 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a1ed9b23-3d11-49f4-bf45-68babef6c01d b/docstore/a1ed9b23-3d11-49f4-bf45-68babef6c01d new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/a1ed9b23-3d11-49f4-bf45-68babef6c01d @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/a1f1efc9-511e-40fa-b709-0d7f89f146ce b/docstore/a1f1efc9-511e-40fa-b709-0d7f89f146ce new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/a1f1efc9-511e-40fa-b709-0d7f89f146ce @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/a2594657-a6b3-485a-bc11-b5ada78b3263 b/docstore/a2594657-a6b3-485a-bc11-b5ada78b3263 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/a2594657-a6b3-485a-bc11-b5ada78b3263 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. 
The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/a262c85c-ea27-4e45-8223-3193b7b46d8a b/docstore/a262c85c-ea27-4e45-8223-3193b7b46d8a new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/a262c85c-ea27-4e45-8223-3193b7b46d8a @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
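If you want to inspect or execute the call yourself rather than rely on the automatic flow described above, the google-genai SDK also lets you switch that behavior off. A hedged sketch reusing the get_current_temperature example; the AutomaticFunctionCallingConfig(disable=True) knob and the response.function_calls accessor are assumptions based on the SDK's types rather than part of the excerpt above:

```python
from google import genai
from google.genai import types

def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location."""
    return {"temperature": 25, "unit": "Celsius"}

client = genai.Client()

# Disable the automatic flow so the model's function call is returned to us
# instead of being executed by the SDK.
config = types.GenerateContentConfig(
    tools=[get_current_temperature],
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in Boston?",
    config=config,
)

# Inspect the structured call the model wants to make.
if response.function_calls:
    call = response.function_calls[0]
    print(call.name, call.args)
```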
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/a2ae3377-5ec1-48be-9401-94a59c69ae71 b/docstore/a2ae3377-5ec1-48be-9401-94a59c69ae71 new file mode 100644 index 0000000000000000000000000000000000000000..8af7573f633bc8337efd3a0ab87cdc8a90abf578 --- /dev/null +++ b/docstore/a2ae3377-5ec1-48be-9401-94a59c69ae71 @@ -0,0 +1 @@ +Studio Our fastest multimodal model with great performance for diverse, repetitive tasks and a 1 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075, prompts <= 128k tokens $0.15, prompts > 128k tokens Output price Free of charge $0.30, prompts <= 128k tokens $0.60, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01875, prompts <= 128k tokens $0.0375, prompts > 128k tokens Context caching (storage) Free of charge $1.00 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Flash-8B Try it in Google AI Studio Our smallest model for lower intelligence use cases, with a 1 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.0375, prompts <= 128k tokens $0.075, prompts > 128k tokens Output price Free of charge $0.15, prompts <= 128k tokens $0.30, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01, prompts <= 128k tokens $0.02, prompts > 128k tokens Context caching (storage) Free of charge $0.25 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Pro Try it in Google AI Studio Our highest intelligence Gemini 1.5 series model, with a breakthrough 2 million token context window. 
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 128k tokens $2.50, prompts > 128k tokens Output price Free of charge $5.00, prompts <= 128k tokens $10.00, prompts > 128k \ No newline at end of file diff --git a/docstore/a2ae6877-0ecf-44a3-bd66-5238f86e93f5 b/docstore/a2ae6877-0ecf-44a3-bd66-5238f86e93f5 new file mode 100644 index 0000000000000000000000000000000000000000..7c883bd368c31d390cf31dfb7ab8807048f20c67 --- /dev/null +++ b/docstore/a2ae6877-0ecf-44a3-bd66-5238f86e93f5 @@ -0,0 +1 @@ +caching price Not available $0.075 (text / image / video) $0.25 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash-Lite RPD) 1,500 RPD (free, limit shared with Flash-Lite RPD), then $35 / 1,000 requests Live API Free of charge Input: $0.50 (text), $3.00 (audio / image [video]) Output: $2.00 (text), $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash-Lite Preview Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price (text, image, video) Free of charge $0.10 (text / image / video) $0.50 (audio) Output price (including thinking tokens) Free of charge $0.40 Context caching price Not available $0.025 (text / image / video) $0.125 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash RPD) 1,500 RPD (free, limit shared with Flash RPD), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Native Audio Try it in Google AI Studio Our native audio models optimized for higher quality audio outputs with better pacing, voice naturalness, verbosity, and mood. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $0.50 (text) $3.00 (audio / video) Output price (including thinking tokens) Not available $2.00 (text) $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash Preview TTS Try it in Google AI Studio Our 2.5 Flash text-to-speech audio model optimized for price-performant, low-latency, controllable speech generation. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Free of \ No newline at end of file diff --git a/docstore/a2b8f78c-5388-4ed6-b956-82c840b3e19f b/docstore/a2b8f78c-5388-4ed6-b956-82c840b3e19f new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/a2b8f78c-5388-4ed6-b956-82c840b3e19f @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/a2cd0c7d-dc32-4218-8145-38cc734529ee b/docstore/a2cd0c7d-dc32-4218-8145-38cc734529ee new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/a2cd0c7d-dc32-4218-8145-38cc734529ee @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. 
Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/a2f1a3aa-b107-4e96-b67c-731ea9701c75 b/docstore/a2f1a3aa-b107-4e96-b67c-731ea9701c75 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/a2f1a3aa-b107-4e96-b67c-731ea9701c75 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
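A sketch of single-speaker speech generation with the TTS preview models described above, assuming the google-genai SDK's SpeechConfig types; the voice name "Kore" and the 24 kHz / 16-bit mono WAV parameters are illustrative assumptions:

```python
import wave
from google import genai
from google.genai import types

client = genai.Client()

# Ask the TTS preview model for audio output with a prebuilt voice.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio comes back as raw PCM bytes; wrap it in a WAV container to play it.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)
    f.setsampwidth(2)
    f.setframerate(24000)
    f.writeframes(pcm)
```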
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/a2f80902-33f5-44ef-8299-7d189f9df1d0 b/docstore/a2f80902-33f5-44ef-8299-7d189f9df1d0 new file mode 100644 index 0000000000000000000000000000000000000000..5b23b75839f7d9f5e86c0814ceb13216aba4c820 --- /dev/null +++ b/docstore/a2f80902-33f5-44ef-8299-7d189f9df1d0 @@ -0,0 +1 @@ +Using Gemini API keys | Google AI for Developers Using Gemini API keys To use the Gemini API, you need an API key. You can create a key for free with a few clicks in Google AI Studio . Once you have an API key, you have the following options to connect to the Gemini API: Setting your API key as an environment variable Providing your API key explicitly For initial testing, you can hard code an API key, but this should only be temporary since it's not secure. You can find examples for hard coding the API key in Providing API key explicitly section. Setting API key as environment variable If you set the environment variable GEMINI_API_KEY or GOOGLE_API_KEY , the API key will automatically be picked up by the client when using one of the Gemini API libraries . It's recommended that you set only one of those variables, but if both are set, GOOGLE_API_KEY takes precedence. If you're using the REST API, or JavaScript on the browser, you will need to provide the API key explicitly. Here is how you can set your API key locally as the environment variable GEMINI_API_KEY with different operating systems. Linux/macOS - Bash Bash is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.bashrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use zsh : touch ~/.bashrc open ~/.bashrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.bashrc macOS \ No newline at end of file diff --git a/docstore/a2f86e47-016e-4789-8f89-8cf29c98b077 b/docstore/a2f86e47-016e-4789-8f89-8cf29c98b077 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/a2f86e47-016e-4789-8f89-8cf29c98b077 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide.
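For completeness, a Python counterpart to the JavaScript and Go usage-metadata snippets above, combined with an explicit thinking budget; client construction assumes GEMINI_API_KEY is set as described above, and the ThinkingConfig field names are taken from the google-genai SDK rather than the excerpt itself:

```python
from google import genai
from google.genai import types

client = genai.Client()  # picks up GEMINI_API_KEY from the environment

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the Monty Hall problem.",
    # thinking_budget is optional; 0 disables thinking on models that allow it.
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)

# Python equivalent of the usage-metadata logging shown above in JavaScript and Go.
print("Thoughts tokens:", response.usage_metadata.thoughts_token_count)
print("Output tokens:", response.usage_metadata.candidates_token_count)
```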
Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/a313e63e-a072-4f33-81c1-9c7a218e5508 b/docstore/a313e63e-a072-4f33-81c1-9c7a218e5508 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/a313e63e-a072-4f33-81c1-9c7a218e5508 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/a327a258-7f8c-477e-b010-d325e09e6ac0 b/docstore/a327a258-7f8c-477e-b010-d325e09e6ac0 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/a327a258-7f8c-477e-b010-d325e09e6ac0 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. 
For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/a33b8854-0c5d-448a-bcab-9eb4ec5c1fbc b/docstore/a33b8854-0c5d-448a-bcab-9eb4ec5c1fbc new file mode 100644 index 0000000000000000000000000000000000000000..44872cf457d184fbc5f51c9428c250f67e74d945 --- /dev/null +++ b/docstore/a33b8854-0c5d-448a-bcab-9eb4ec5c1fbc @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-tools#tools-overview Title: Tool use with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a35a94cc-d5ed-4a18-9c33-f2e1c5804ff8 b/docstore/a35a94cc-d5ed-4a18-9c33-f2e1c5804ff8 new file mode 100644 index 0000000000000000000000000000000000000000..aee05449de83b11c2592ad958994cdaf02f0141d --- /dev/null +++ b/docstore/a35a94cc-d5ed-4a18-9c33-f2e1c5804ff8 @@ -0,0 +1 @@ +done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Turn on the lights please' ; session . sendClientContent ({ turns : inputTurns }); let turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } else if ( turn . toolCall ) { const functionResponses = []; for ( const fc of turn . toolCall . functionCalls ) { functionResponses . push ({ id : fc . id , name : fc . name , response : { result : "ok" } // simple, hard-coded function response }); } console . debug ( 'Sending tool response...\n' ); session . sendToolResponse ({ functionResponses : functionResponses }); } } // Check again for new messages turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); From a single prompt, the model can generate multiple function calls and the code necessary to chain their outputs. This code executes in a sandbox environment, generating subsequent BidiGenerateContentToolCall messages. Asynchronous function calling Note: \ No newline at end of file diff --git a/docstore/a3704544-c141-412e-884f-dc6105fd0639 b/docstore/a3704544-c141-412e-884f-dc6105fd0639 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/a3704544-c141-412e-884f-dc6105fd0639 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/a3816d70-2007-4ec4-bc58-d6bc957955a4 b/docstore/a3816d70-2007-4ec4-bc58-d6bc957955a4 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/a3816d70-2007-4ec4-bc58-d6bc957955a4 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . 
ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/a3874c49-b3ae-4242-b7e1-9577c15403e6 b/docstore/a3874c49-b3ae-4242-b7e1-9577c15403e6 new file mode 100644 index 0000000000000000000000000000000000000000..b0571e28c8e74f7e3e23139b08c0865b24edbd38 --- /dev/null +++ b/docstore/a3874c49-b3ae-4242-b7e1-9577c15403e6 @@ -0,0 +1 @@ +And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. 
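On the Python side, the Pydantic route can be collapsed into a short sketch (assuming the google-genai SDK; the Recipe model and prompt are illustrative, and the parsed attribute is assumed to return instantiated Pydantic objects when a Pydantic type is passed as the response schema):

```python
from google import genai
from pydantic import BaseModel

class Recipe(BaseModel):
    recipe_name: str
    rating: str

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List 3 home-baked cookie recipes and give each a letter grade.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)

print(response.text)    # raw JSON string constrained to the schema
print(response.parsed)  # list of Recipe instances deserialized by the SDK
```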
Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python . Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": \ No newline at end of file diff --git a/docstore/a39b26cd-6cac-4c99-887c-7fa4169689cc b/docstore/a39b26cd-6cac-4c99-887c-7fa4169689cc new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/a39b26cd-6cac-4c99-887c-7fa4169689cc @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt.
Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/a39e68a9-17ce-428b-931f-9c258742e62c b/docstore/a39e68a9-17ce-428b-931f-9c258742e62c new file mode 100644 index 0000000000000000000000000000000000000000..15267ad406ab5b1a17ee9a318ef01a462d6513da --- /dev/null +++ b/docstore/a39e68a9-17ce-428b-931f-9c258742e62c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/google-search#main-content Title: Grounding with Google Search | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a3ac49cf-1dae-4c56-9772-ad18700e5fae b/docstore/a3ac49cf-1dae-4c56-9772-ad18700e5fae new file mode 100644 index 0000000000000000000000000000000000000000..024dbd78fa7ec55b1f91d59d77cedd6de4a8bd83 --- /dev/null +++ b/docstore/a3ac49cf-1dae-4c56-9772-ad18700e5fae @@ -0,0 +1 @@ +requiring re-initiation of the provisioning process. Verify secure authentication for your own backend. Ephemeral tokens will only be as secure as your backend authentication method. Generally, avoid using ephemeral tokens for backend-to-Gemini connections, as this path is typically considered secure. Limitations Ephemeral tokens are only compatible with Live API at this time. What's next Read the Live API reference on ephemeral tokens for more information. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/a3acfbf4-d9b6-4e17-94ed-1848f8a99a55 b/docstore/a3acfbf4-d9b6-4e17-94ed-1848f8a99a55 new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/a3acfbf4-d9b6-4e17-94ed-1848f8a99a55 @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. 
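As a quick orientation before the details that follow, here is a minimal single-speaker sketch (a sketch only, assuming the google-genai Python SDK, the gemini-2.5-flash-preview-tts model, and the Aoede prebuilt voice from the voice list later on this page; the raw PCM output is assumed to be 24kHz, 16-bit mono, so adjust the WAV parameters if your response reports otherwise):

```python
import wave
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Aoede")
            )
        ),
    ),
)

# The audio comes back as inline PCM bytes; wrap them in a WAV container to play.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)       # mono (assumed)
    wf.setsampwidth(2)       # 16-bit samples (assumed)
    wf.setframerate(24000)   # 24kHz output (assumed)
    wf.writeframes(pcm)
```

The configuration details (response modality, SpeechConfig, and voice selection) are covered step by step in the sections below.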
Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a \ No newline at end of file diff --git a/docstore/a3bd57a7-8eee-4a6a-8424-9fe287050c02 b/docstore/a3bd57a7-8eee-4a6a-8424-9fe287050c02 new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/a3bd57a7-8eee-4a6a-8424-9fe287050c02 @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. 
The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/a3ed518b-e1e6-4288-82a3-02879061db24 b/docstore/a3ed518b-e1e6-4288-82a3-02879061db24 new file mode 100644 index 0000000000000000000000000000000000000000..b6194c8105fe9f40c0d9a89b00594ca4d33e213c --- /dev/null +++ b/docstore/a3ed518b-e1e6-4288-82a3-02879061db24 @@ -0,0 +1 @@ +While meeting the stated qualification criteria is generally sufficient for approval, in rare cases an upgrade request may be denied based on other factors identified during the review process. This system helps maintain the security and integrity of the Gemini API platform for all users. Standard API rate limits The following table lists the rate limits for all standard Gemini API calls. Free Tier Model RPM TPM RPD Gemini 2.5 Pro 5 250,000 100 Gemini 2.5 Flash 10 250,000 250 Gemini 2.5 Flash-Lite Preview 06-17 15 250,000 1,000 Gemini 2.5 Flash Preview TTS 3 10,000 15 Gemini 2.5 Pro Preview TTS -- -- -- Gemini 2.0 Flash 15 1,000,000 200 Gemini 2.0 Flash Preview Image Generation 10 200,000 100 Gemini 2.0 Flash-Lite 30 1,000,000 200 Imagen 3 -- -- -- Veo 2 -- -- -- Gemini 1.5 Flash (Deprecated) 15 250,000 50 Gemini 1.5 Flash-8B (Deprecated) 15 250,000 50 Gemini 1.5 Pro (Deprecated) -- -- -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 5 -- 100 Tier 1 Model RPM TPM RPD Gemini 2.5 Pro 150 2,000,000 1,000 Gemini 2.5 Flash 1,000 1,000,000 10,000 Gemini 2.5 Flash-Lite Preview 06-17 4,000 4,000,000 -- Gemini 2.5 Flash Preview TTS 10 10,000 100 Gemini 2.5 Pro Preview TTS 10 10,000 50 Gemini 2.0 Flash 2,000 4,000,000 -- Gemini 2.0 Flash Preview Image Generation 1,000 1,000,000 10,000 Gemini 2.0 Flash-Lite 4,000 4,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 2 Model RPM TPM RPD Gemini 2.5 Pro 1,000 5,000,000 50,000 Gemini 2.5 Flash 2,000 3,000,000 100,000 Gemini 2.5 Flash-Lite Preview 06-17 10,000 10,000,000 100,000 Gemini 2.5 Flash Preview TTS 1,000 100,000 10,000 Gemini 2.5 Pro Preview TTS 100 25,000 1,000 Gemini 2.0 Flash 10,000 10,000,000 -- \ No newline at end of file diff --git a/docstore/a3fbc7b4-258b-40d2-8ece-ee56156e8434 b/docstore/a3fbc7b4-258b-40d2-8ece-ee56156e8434 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/a3fbc7b4-258b-40d2-8ece-ee56156e8434 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. 
When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/a3fc0b3c-788a-4a2a-8c7a-9b0f8e467087 b/docstore/a3fc0b3c-788a-4a2a-8c7a-9b0f8e467087 new file mode 100644 index 0000000000000000000000000000000000000000..b362fdd58ed7301c466f0b3a048e65a061fc1b90 --- /dev/null +++ b/docstore/a3fc0b3c-788a-4a2a-8c7a-9b0f8e467087 @@ -0,0 +1 @@ +"messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . 
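As a sketch of a complete streaming call through the OpenAI-compatible endpoint used above (the stream flag and per-chunk delta handling follow the standard OpenAI Python SDK; the prompt and model name mirror the earlier examples):

```python
from openai import OpenAI

client = OpenAI(
    api_key="GEMINI_API_KEY",
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

# stream=True returns an iterator of chunks instead of one final message.
stream = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[{"role": "user", "content": "Explain to me how AI works"}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
```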
Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { \ No newline at end of file diff --git a/docstore/a40ba3e8-6c7a-4a83-abda-dda9d0f26e22 b/docstore/a40ba3e8-6c7a-4a83-abda-dda9d0f26e22 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/a40ba3e8-6c7a-4a83-abda-dda9d0f26e22 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/a43aefc1-9c54-477c-9f9d-888a16dd2af5 b/docstore/a43aefc1-9c54-477c-9f9d-888a16dd2af5 new file mode 100644 index 0000000000000000000000000000000000000000..eeaa745b8119787addf02809d9d1b660f835f8e5 --- /dev/null +++ b/docstore/a43aefc1-9c54-477c-9f9d-888a16dd2af5 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. 
Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart Python from google import genai from google.genai import types # Define the function declaration for the model weather_function = { "name" : "get_current_temperature" , "description" : "Gets the current temperature for a given location." , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city name, e.g. San Francisco" , }, }, "required" : [ "location" ], }, } # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ weather_function ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Send request with function declarations response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/a43bca83-c2b9-42ff-a7ff-b822810a5e22 b/docstore/a43bca83-c2b9-42ff-a7ff-b822810a5e22 new file mode 100644 index 0000000000000000000000000000000000000000..e44291587802102f7099723c593f8748408a6ab5 --- /dev/null +++ b/docstore/a43bca83-c2b9-42ff-a7ff-b822810a5e22 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#token-size Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a4482dda-006d-4a9c-9636-376b0d1ce76a b/docstore/a4482dda-006d-4a9c-9636-376b0d1ce76a new file mode 100644 index 0000000000000000000000000000000000000000..47c113cd9bacf84a434d531b6a240f0025b74130 --- /dev/null +++ b/docstore/a4482dda-006d-4a9c-9636-376b0d1ce76a @@ -0,0 +1 @@ +. For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC. \ No newline at end of file diff --git a/docstore/a46be37f-9e89-43cf-a204-b7a359c42ffb b/docstore/a46be37f-9e89-43cf-a204-b7a359c42ffb new file mode 100644 index 0000000000000000000000000000000000000000..48ce7760ed3b3e078bbb96293e0e67132c5a10c7 --- /dev/null +++ b/docstore/a46be37f-9e89-43cf-a204-b7a359c42ffb @@ -0,0 +1 @@ +Video understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Video understanding Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. 
Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible. This guide shows how to use the Gemini API to generate text responses based on video inputs. Video input You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt. Upload a video file You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). "Jupiter's Great Red Spot Shrinks and Grows" is in the \ No newline at end of file diff --git a/docstore/a48f58b3-af58-40c9-8f3e-b002786e65d1 b/docstore/a48f58b3-af58-40c9-8f3e-b002786e65d1 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/a48f58b3-af58-40c9-8f3e-b002786e65d1 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/a49f3874-986d-4f35-9e71-fabc62aa5fa9 b/docstore/a49f3874-986d-4f35-9e71-fabc62aa5fa9 new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/a49f3874-986d-4f35-9e71-fabc62aa5fa9 @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/a4c67717-cadf-456c-9f7b-31ecbdeccc5f b/docstore/a4c67717-cadf-456c-9f7b-31ecbdeccc5f new file mode 100644 index 0000000000000000000000000000000000000000..9a3ae8e54d036eb9d08cf51953b4e3479c03ffae --- /dev/null +++ b/docstore/a4c67717-cadf-456c-9f7b-31ecbdeccc5f @@ -0,0 +1 @@ +Post-processing and human evaluation are essential to limit the risk of harm from such outputs. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/a4c7d41c-3227-4148-973f-2632d9f8498b b/docstore/a4c7d41c-3227-4148-973f-2632d9f8498b new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/a4c7d41c-3227-4148-973f-2632d9f8498b @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/a4d265a6-a0e7-4212-bcf1-6c8726092b51 b/docstore/a4d265a6-a0e7-4212-bcf1-6c8726092b51 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/a4d265a6-a0e7-4212-bcf1-6c8726092b51 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/a4e0bc01-49f7-479f-839a-3490458a7684 b/docstore/a4e0bc01-49f7-479f-839a-3490458a7684 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/a4e0bc01-49f7-479f-839a-3490458a7684 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. 
Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/a4f3132e-86bd-41d9-a560-51adb62eb840 b/docstore/a4f3132e-86bd-41d9-a560-51adb62eb840 new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/a4f3132e-86bd-41d9-a560-51adb62eb840 @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/a51c60a1-2f9a-4adf-9c24-adccf548cedb b/docstore/a51c60a1-2f9a-4adf-9c24-adccf548cedb new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/a51c60a1-2f9a-4adf-9c24-adccf548cedb @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/a53557a3-ca52-4258-ac41-f0e735f5ff71 b/docstore/a53557a3-ca52-4258-ac41-f0e735f5ff71 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/a53557a3-ca52-4258-ac41-f0e735f5ff71 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. 
markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/a54434ba-f372-4029-b07d-d077c26a90fc b/docstore/a54434ba-f372-4029-b07d-d077c26a90fc new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/a54434ba-f372-4029-b07d-d077c26a90fc @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/a5754164-4d4e-4958-935b-edb9066ccbf1 b/docstore/a5754164-4d4e-4958-935b-edb9066ccbf1 new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/a5754164-4d4e-4958-935b-edb9066ccbf1 @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. 
Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/a5862d1f-40d8-4427-9b73-89d03517a93b b/docstore/a5862d1f-40d8-4427-9b73-89d03517a93b new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/a5862d1f-40d8-4427-9b73-89d03517a93b @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . 
stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/a59bc8dd-3f7d-4cd2-abd2-1fcecb07fc28 b/docstore/a59bc8dd-3f7d-4cd2-abd2-1fcecb07fc28 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/a59bc8dd-3f7d-4cd2-abd2-1fcecb07fc28 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/a59c6295-df94-40ac-9b6c-741ab44b5512 b/docstore/a59c6295-df94-40ac-9b6c-741ab44b5512 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/a59c6295-df94-40ac-9b6c-741ab44b5512 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
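For comparison with the Go, REST, and Apps Script samples above, a rough Python sketch of the same multi-turn streaming chat might look like the following (assuming the google-genai SDK is installed and GEMINI_API_KEY is set in your environment):

Python
from google import genai
from google.genai import types

client = genai.Client()  # reads GEMINI_API_KEY from the environment

# Seed the chat with the same two-turn history used in the samples above.
chat = client.chats.create(
    model="gemini-2.5-flash",
    history=[
        types.Content(role="user", parts=[types.Part(text="Hello")]),
        types.Content(
            role="model",
            parts=[types.Part(text="Great to meet you. What would you like to know?")],
        ),
    ],
)

# Stream the next turn chunk by chunk.
for chunk in chat.send_message_stream(
    "I have two dogs in my house. How many paws are in my house?"
):
    print(chunk.text, end="")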
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/a5a71f29-6175-47ef-8bd5-a32545fde67a b/docstore/a5a71f29-6175-47ef-8bd5-a32545fde67a new file mode 100644 index 0000000000000000000000000000000000000000..d4dbae9e7c7ca8b1cdfa318d76a5862ef0d88e3c --- /dev/null +++ b/docstore/a5a71f29-6175-47ef-8bd5-a32545fde67a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#generate-from-images Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a5b15581-884a-4d76-9b0e-3b831ef246c7 b/docstore/a5b15581-884a-4d76-9b0e-3b831ef246c7 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/a5b15581-884a-4d76-9b0e-3b831ef246c7 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/a5bdca3f-a4ea-41f7-b42c-6a82e5fdc31d b/docstore/a5bdca3f-a4ea-41f7-b42c-6a82e5fdc31d new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/a5bdca3f-a4ea-41f7-b42c-6a82e5fdc31d @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/a5cafe48-4b5e-4472-a9e9-14c9bbbd7647 b/docstore/a5cafe48-4b5e-4472-a9e9-14c9bbbd7647 new file mode 100644 index 0000000000000000000000000000000000000000..f1beb73771dc026f597f2978b65f85f1da640824 --- /dev/null +++ b/docstore/a5cafe48-4b5e-4472-a9e9-14c9bbbd7647 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/structured-output#generating-enums Title: Structured output | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a5e6d5d2-0119-4293-aa44-07ce86d09f7b b/docstore/a5e6d5d2-0119-4293-aa44-07ce86d09f7b new file mode 100644 index 0000000000000000000000000000000000000000..eb6db224edbdd160f04cb946308fd82587e98eec --- /dev/null +++ b/docstore/a5e6d5d2-0119-4293-aa44-07ce86d09f7b @@ -0,0 +1 @@ +marks Spain's record-breaking fourth European Championship title.[5]((https:/...), [2](https:/...), [3](https:/...), [4](https:/...) Pricing When you use Grounding with Google Search, your project is billed per API request that includes the google_search tool. If the model decides to execute multiple search queries to answer a single prompt (for example, searching for "UEFA Euro 2024 winner" and "Spain vs England Euro 2024 final score" within the same API call), this counts as a single billable use of the tool for that request. For detailed pricing information, see the Gemini API pricing page . Supported Models Experimental and Preview models are not included. You can find their capabilities on the model overview page. Model Grounding with Google Search Gemini 2.5 Pro ✔️ Gemini 2.5 Flash ✔️ Gemini 2.0 Flash ✔️ Gemini 1.5 Pro ✔️ Gemini 1.5 Flash ✔️ Note: Older models use a google_search_retrieval tool. For all current models, use the google_search tool as shown in the examples. Grounding with Gemini 1.5 Models (Legacy) While the google_search tool is recommended for Gemini 2.0 and later, Gemini 1.5 support a legacy tool named google_search_retrieval . This tool provides a dynamic mode that allows the model to decide whether to perform a search based on its confidence that the prompt requires fresh information. If the model's confidence is above a dynamic_threshold you set (a value between 0.0 and 1.0), it will perform a search. Python # Note: This is a legacy approach for Gemini 1.5 models. # The 'google_search' tool is recommended for all new development. import os from google import genai from google.genai import types client = genai . Client () retrieval_tool = types . Tool ( google_search_retrieval = types . GoogleSearchRetrieval ( dynamic_retrieval_config = types . DynamicRetrievalConfig ( mode = types . DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamic_threshold = 0.7 # Only search if confidence > 70% ) ) ) config = types . 
GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/a5e81d2e-795b-40a1-9b34-b2bea922f656 b/docstore/a5e81d2e-795b-40a1-9b34-b2bea922f656 new file mode 100644 index 0000000000000000000000000000000000000000..771c2c741948f29f5c3605e7090d7f1d54bfcf1f --- /dev/null +++ b/docstore/a5e81d2e-795b-40a1-9b34-b2bea922f656 @@ -0,0 +1 @@ +For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion" , "String" , "Woodwind" , "Brass" , or " "Keyboard" ". You could create an enum to help with this task. In the following example, you pass an enum as the responseSchema , constraining the model to choose the most appropriate option. Python from google import genai import enum class Instrument ( enum . Enum ): PERCUSSION = "Percussion" STRING = "String" WOODWIND = "Woodwind" BRASS = "Brass" KEYBOARD = "Keyboard" client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : Instrument , }, ) print ( response . text ) # Woodwind JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "What type of instrument is an oboe?" , config : { responseMimeType : "text/x.enum" , responseSchema : { type : Type . STRING , enum : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, }); console . log ( response . text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "What type of instrument is an oboe?" } ] }], "generationConfig": { "responseMimeType": "text/x.enum", "responseSchema": { "type": "STRING", "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"] } } }' The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ( Schema ). There are two other ways to specify an enumeration. You can use a Literal : ``` Python Literal [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ] \ No newline at end of file diff --git a/docstore/a5fd4bfa-b238-4c8c-9ee6-252bdb0f5009 b/docstore/a5fd4bfa-b238-4c8c-9ee6-252bdb0f5009 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/a5fd4bfa-b238-4c8c-9ee6-252bdb0f5009 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. 
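As a rough sketch of how you might send one of these troubleshooting prompts with the Python SDK (the image path is hypothetical; substitute your own photo):

Python
from google import genai

client = genai.Client()

# Hypothetical image of the drink you're pairing a snack with.
image_file = client.files.upload(file="path/to/tea.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        image_file,
        "What's a snack I can make in 1 minute that would go well with this? "
        "Please explain why.",
    ],
)
print(response.text)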
What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/a5ff737e-39d3-4ab7-9e7e-858638041270 b/docstore/a5ff737e-39d3-4ab7-9e7e-858638041270 new file mode 100644 index 0000000000000000000000000000000000000000..76b3241c42effba70a7ac847bf09514ad7de11cd --- /dev/null +++ b/docstore/a5ff737e-39d3-4ab7-9e7e-858638041270 @@ -0,0 +1 @@ +not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def \ No newline at end of file diff --git a/docstore/a6046451-8cd6-4710-a9f1-0ffc85c722f8 b/docstore/a6046451-8cd6-4710-a9f1-0ffc85c722f8 new file mode 100644 index 0000000000000000000000000000000000000000..7ad07eb45fff1ffd88928a8c1191c40c43412859 --- /dev/null +++ b/docstore/a6046451-8cd6-4710-a9f1-0ffc85c722f8 @@ -0,0 +1 @@ +public domain and does not show identifiable people. ( NASA image and media usage guidelines. 
) The following code downloads the sample video, uploads it using the File API, waits for it to be processed, and then uses the file reference in a generateContent request. Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp4" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file \ No newline at end of file diff --git a/docstore/a6194896-4704-46ff-8397-bd07512b108c b/docstore/a6194896-4704-46ff-8397-bd07512b108c new file mode 100644 index 0000000000000000000000000000000000000000..8af7573f633bc8337efd3a0ab87cdc8a90abf578 --- /dev/null +++ b/docstore/a6194896-4704-46ff-8397-bd07512b108c @@ -0,0 +1 @@ +Studio Our fastest multimodal model with great performance for diverse, repetitive tasks and a 1 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.075, prompts <= 128k tokens $0.15, prompts > 128k tokens Output price Free of charge $0.30, prompts <= 128k tokens $0.60, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01875, prompts <= 128k tokens $0.0375, prompts > 128k tokens Context caching (storage) Free of charge $1.00 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Flash-8B Try it in Google AI Studio Our smallest model for lower intelligence use cases, with a 1 million token context window. 
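Returning to the video-upload example above: larger files are processed asynchronously, so as a rough sketch you can poll the File API until the upload becomes ACTIVE before referencing it (the file path and prompt are placeholders):

Python
import time
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp4")

# Poll until processing finishes; the file can't be used while it is PROCESSING.
while myfile.state.name == "PROCESSING":
    time.sleep(5)
    myfile = client.files.get(name=myfile.name)

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[myfile, "Summarize this video."],
)
print(response.text)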
Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.0375, prompts <= 128k tokens $0.075, prompts > 128k tokens Output price Free of charge $0.15, prompts <= 128k tokens $0.30, prompts > 128k tokens Context caching price Free of charge, up to 1 million tokens of storage per hour $0.01, prompts <= 128k tokens $0.02, prompts > 128k tokens Context caching (storage) Free of charge $0.25 per hour Tuning price Token prices are the same for tuned models Tuning service is free of charge. Token prices are the same for tuned models Tuning service is free of charge. Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Gemini 1.5 Pro Try it in Google AI Studio Our highest intelligence Gemini 1.5 series model, with a breakthrough 2 million token context window. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 128k tokens $2.50, prompts > 128k tokens Output price Free of charge $5.00, prompts <= 128k tokens $10.00, prompts > 128k \ No newline at end of file diff --git a/docstore/a625bf69-8f04-4764-bc18-563fbda2f0cb b/docstore/a625bf69-8f04-4764-bc18-563fbda2f0cb new file mode 100644 index 0000000000000000000000000000000000000000..53e5ed0c4b3c9d5f8d129df24753928921198efa --- /dev/null +++ b/docstore/a625bf69-8f04-4764-bc18-563fbda2f0cb @@ -0,0 +1 @@ +text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ { "parts": [ {"text": "Who won the euro 2024?"} ] } ], "tools": [ { "google_search": {} } ] }' You can learn more by trying the Search tool notebook . How grounding with Google Search works When you enable the google_search tool, the model handles the entire workflow of searching, processing, and citing information automatically. User Prompt: Your application sends a user's prompt to the Gemini API with the google_search tool enabled. Prompt Analysis: The model analyzes the prompt and determines if a Google Search can improve the answer. Google Search: If needed, the model automatically generates one or multiple search queries and executes them. Search Results Processing: The model processes the search results, synthesizes the information, and formulates a response. Grounded Response: The API returns a final, user-friendly response that is grounded in the search results. This response includes the model's text answer and groundingMetadata with the search queries, web results, and citations. Understanding the Grounding Response When a response is successfully grounded, the response includes a groundingMetadata field. This structured data is essential for verifying claims and building a rich citation experience in your application. { "candidates" : [ { "content" : { "parts" : [ { "text" : "Spain won Euro 2024, defeating England 2-1 in the final. This victory marks Spain's record fourth European Championship title." } ], "role" : "model" }, "groundingMetadata" : { "webSearchQueries" : [ "UEFA Euro 2024 winner" , "who won euro 2024" ], "searchEntryPoint" : { "renderedContent" : "" }, "groundingChunks" : [ { "web" : { "uri" : "https://vertexaisearch.cloud.google.com....." 
, "title" : "aljazeera.com" }}, { "web" : \ No newline at end of file diff --git a/docstore/a636abc6-ca17-4d37-b6dc-34360d3d77fd b/docstore/a636abc6-ca17-4d37-b6dc-34360d3d77fd new file mode 100644 index 0000000000000000000000000000000000000000..007f635c2d64ef42082bdbd7a31da59c854333a3 --- /dev/null +++ b/docstore/a636abc6-ca17-4d37-b6dc-34360d3d77fd @@ -0,0 +1 @@ +SILENT } ) JavaScript import { GoogleGenAI , Modality , Behavior , FunctionResponseScheduling } from '@google/genai' ; // for a non-blocking function definition, apply scheduling in the function response: const functionResponse = { id : fc . id , name : fc . name , response : { result : "ok" , scheduling : FunctionResponseScheduling . INTERRUPT // Can also be WHEN_IDLE or SILENT } } Code execution You can define code execution as part of the session configuration. This lets the Live API generate and execute Python code and dynamically perform computations to benefit your results. See the Code execution tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" tools = [{ 'code_execution' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Compute the largest prime palindrome under 100000." await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ codeExecution : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = \ No newline at end of file diff --git a/docstore/a63b4def-06bd-47ea-b6cd-fbba6e7bd3cb b/docstore/a63b4def-06bd-47ea-b6cd-fbba6e7bd3cb new file mode 100644 index 0000000000000000000000000000000000000000..b362fdd58ed7301c466f0b3a048e65a061fc1b90 --- /dev/null +++ b/docstore/a63b4def-06bd-47ea-b6cd-fbba6e7bd3cb @@ -0,0 +1 @@ +"messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . 
message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { \ No newline at end of file diff --git a/docstore/a64380ca-6877-4011-a91b-0ad1d4b5051c b/docstore/a64380ca-6877-4011-a91b-0ad1d4b5051c new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/a64380ca-6877-4011-a91b-0ad1d4b5051c @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/a64c4bdf-9cdc-4547-9e04-5025d5bd1f4c b/docstore/a64c4bdf-9cdc-4547-9e04-5025d5bd1f4c new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/a64c4bdf-9cdc-4547-9e04-5025d5bd1f4c @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/a6521fa5-f672-4049-b79f-d3d62097bc6e b/docstore/a6521fa5-f672-4049-b79f-d3d62097bc6e new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/a6521fa5-f672-4049-b79f-d3d62097bc6e @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
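As a quick illustration of the embedding model described above, a minimal Python call might look like this (the input string is arbitrary):

Python
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)

# One embedding per input string; text-embedding-004 produces 768-dimensional vectors.
print(len(result.embeddings[0].values))  # 768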
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/a65bf61b-fedd-4ba7-8495-01845583a672 b/docstore/a65bf61b-fedd-4ba7-8495-01845583a672 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/a65bf61b-fedd-4ba7-8495-01845583a672 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. 
*/ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/a6770ce7-b98c-433c-89e0-b98e9228c023 b/docstore/a6770ce7-b98c-433c-89e0-b98e9228c023 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/a6770ce7-b98c-433c-89e0-b98e9228c023 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/a68133d8-2284-4abe-a36c-c04aeebe0588 b/docstore/a68133d8-2284-4abe-a36c-c04aeebe0588 new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/a68133d8-2284-4abe-a36c-c04aeebe0588 @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. 
Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/a68af663-4930-4139-880a-1d73c30e9805 b/docstore/a68af663-4930-4139-880a-1d73c30e9805 new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/a68af663-4930-4139-880a-1d73c30e9805 @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. 
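Before the Live API multi-tool example below, here is a rough sketch of the automatic function calling flow described earlier, using a hypothetical get_current_temperature helper. The SDK derives the declaration from the type hints and docstring, executes the call, and returns the final text:

Python
from google import genai
from google.genai import types

client = genai.Client()

def get_current_temperature(location: str) -> dict:
    """Returns the current temperature for a given location.

    Args:
        location: A city name, for example "London".
    """
    # Hypothetical stand-in for a real weather lookup.
    return {"location": location, "temperature_celsius": 21}

config = types.GenerateContentConfig(tools=[get_current_temperature])

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in London right now?",
    config=config,
)
print(response.text)  # The SDK handled the function call behind the scenes.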
The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/a68e9e66-cf9e-4fbb-9e80-6c72a845e71a b/docstore/a68e9e66-cf9e-4fbb-9e80-6c72a845e71a new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/a68e9e66-cf9e-4fbb-9e80-6c72a845e71a @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/a6a8fdb1-aae3-4bca-9288-d856cf4cadd8 b/docstore/a6a8fdb1-aae3-4bca-9288-d856cf4cadd8 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/a6a8fdb1-aae3-4bca-9288-d856cf4cadd8 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/a6b838d3-0a73-4b69-a77f-b68debed0e7f b/docstore/a6b838d3-0a73-4b69-a77f-b68debed0e7f new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/a6b838d3-0a73-4b69-a77f-b68debed0e7f @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/a6d01e6c-9df5-4c86-83c8-6e7966f7a8fd b/docstore/a6d01e6c-9df5-4c86-83c8-6e7966f7a8fd new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/a6d01e6c-9df5-4c86-83c8-6e7966f7a8fd @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/a6d95615-eab0-46e3-8320-7d9600142fc9 b/docstore/a6d95615-eab0-46e3-8320-7d9600142fc9 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/a6d95615-eab0-46e3-8320-7d9600142fc9 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/a6e076ff-5dfa-4a9a-baf0-83ba124974fd b/docstore/a6e076ff-5dfa-4a9a-baf0-83ba124974fd new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/a6e076ff-5dfa-4a9a-baf0-83ba124974fd @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/a6ea6e37-6c2a-46f2-8b9d-0c8309beb32f b/docstore/a6ea6e37-6c2a-46f2-8b9d-0c8309beb32f new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/a6ea6e37-6c2a-46f2-8b9d-0c8309beb32f @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. 
The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/a709007d-d9e3-4119-a624-b16c893d1aab b/docstore/a709007d-d9e3-4119-a624-b16c893d1aab new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/a709007d-d9e3-4119-a624-b16c893d1aab @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/a719d2cb-ed1f-4c0b-bed3-88b1ca72dda9 b/docstore/a719d2cb-ed1f-4c0b-bed3-88b1ca72dda9 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/a719d2cb-ed1f-4c0b-bed3-88b1ca72dda9 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. 
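Because token counts drive both the limits above and billing, it can be useful to check them programmatically. Here is a minimal sketch using the google-genai Python SDK; the model name and prompt are illustrative placeholders, and it assumes GEMINI_API_KEY is set in the environment.

Python

from google import genai

client = genai.Client()  # picks up GEMINI_API_KEY from the environment

# Estimate how many tokens a prompt will consume before sending it.
count = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents="Explain how snow is formed in two sentences.",
)
print(count.total_tokens)  # roughly one token per ~4 characters of English text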
Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/a725a3b4-2a76-4b53-860e-418f13c67b08 b/docstore/a725a3b4-2a76-4b53-860e-418f13c67b08 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/a725a3b4-2a76-4b53-860e-418f13c67b08 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . 
import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/a7321124-9dbc-452b-8d6c-dec451ac5fc3 b/docstore/a7321124-9dbc-452b-8d6c-dec451ac5fc3 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/a7321124-9dbc-452b-8d6c-dec451ac5fc3 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/a7363842-2986-4adc-87e8-c7d9a35283b7 b/docstore/a7363842-2986-4adc-87e8-c7d9a35283b7 new file mode 100644 index 0000000000000000000000000000000000000000..5a67c041917cdaf904b0e03794a07af474503a9a --- /dev/null +++ b/docstore/a7363842-2986-4adc-87e8-c7d9a35283b7 @@ -0,0 +1 @@ +upload..." 
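# The steps below follow the File API's resumable upload protocol:
#   1. A "start" request declares the file metadata; its response headers are
#      written to a temp file and include the x-goog-upload-url to send bytes to.
#   2. An "upload, finalize" request streams the video bytes to that URL; the
#      JSON response is saved and the file.uri is extracted with jq.
#   3. generateContent is then called with a file_data part that references the
#      uploaded file's URI alongside the text prompt.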
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = \ No newline at end of file diff --git a/docstore/a749915c-fbc5-4e2c-875d-30ad32ec1d5d b/docstore/a749915c-fbc5-4e2c-875d-30ad32ec1d5d new file mode 100644 index 0000000000000000000000000000000000000000..0b9f6d133a3c8fd1d2ce9eec58555814c092074d --- /dev/null +++ b/docstore/a749915c-fbc5-4e2c-875d-30ad32ec1d5d @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"inline_data": {"mime_type": "application/pdf", "data": "' " $ENCODED_PDF " '"}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" You can also read a PDF from a local file for processing: Python from google import genai from google.genai import types import pathlib client = genai . Client () # Retrieve and encode the PDF byte filepath = pathlib . Path ( 'file.pdf' ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = filepath . read_bytes (), mime_type = 'application/pdf' , ), prompt ]) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from 'fs' ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( fs . readFileSync ( "content/343019_3_art_0_py4t4l_convrt.pdf" )). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfBytes , _ := os . ReadFile ( "path/to/your/file.pdf" ) parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText \ No newline at end of file diff --git a/docstore/a753a955-4b1a-44df-87fb-a51a32bdd41b b/docstore/a753a955-4b1a-44df-87fb-a51a32bdd41b new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/a753a955-4b1a-44df-87fb-a51a32bdd41b @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/a758c6fe-5c66-4199-b5f3-65d0754416d4 b/docstore/a758c6fe-5c66-4199-b5f3-65d0754416d4 new file mode 100644 index 0000000000000000000000000000000000000000..8219f10e184a0891e4bb35822a37a2ddc4e20372 --- /dev/null +++ b/docstore/a758c6fe-5c66-4199-b5f3-65d0754416d4 @@ -0,0 +1 @@ +"type": "STRING" }, "ingredients": { "type": "ARRAY", "items": { "type": "STRING" } } }, "propertyOrdering": ["recipeName", "ingredients"] } } } }' 2 > /dev/null | head The output might look like this: [ { "recipeName" : "Chocolate Chip Cookies" , "ingredients" : [ "1 cup (2 sticks) unsalted butter, softened" , "3/4 cup granulated sugar" , "3/4 cup packed brown sugar" , "1 teaspoon vanilla extract" , "2 large eggs" , "2 1/4 cups all-purpose flour" , "1 teaspoon baking soda" , "1 teaspoon salt" , "2 cups chocolate chips" ] }, ... ] Providing a schema in a text prompt Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended , because it might produce lower quality output, and because the model is not constrained to follow the schema. Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. \ No newline at end of file diff --git a/docstore/a7600810-5b6c-447c-95c9-0eac4802fae7 b/docstore/a7600810-5b6c-447c-95c9-0eac4802fae7 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/a7600810-5b6c-447c-95c9-0eac4802fae7 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. 
city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/a7600a4e-17c6-4365-89fc-ccf0ff4cb126 b/docstore/a7600a4e-17c6-4365-89fc-ccf0ff4cb126 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/a7600a4e-17c6-4365-89fc-ccf0ff4cb126 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. 
However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/a7612ff5-81c4-49b8-a4ee-dc96a6bf0e47 b/docstore/a7612ff5-81c4-49b8-a4ee-dc96a6bf0e47 new file mode 100644 index 0000000000000000000000000000000000000000..8219f10e184a0891e4bb35822a37a2ddc4e20372 --- /dev/null +++ b/docstore/a7612ff5-81c4-49b8-a4ee-dc96a6bf0e47 @@ -0,0 +1 @@ +"type": "STRING" }, "ingredients": { "type": "ARRAY", "items": { "type": "STRING" } } }, "propertyOrdering": ["recipeName", "ingredients"] } } } }' 2 > /dev/null | head The output might look like this: [ { "recipeName" : "Chocolate Chip Cookies" , "ingredients" : [ "1 cup (2 sticks) unsalted butter, softened" , "3/4 cup granulated sugar" , "3/4 cup packed brown sugar" , "1 teaspoon vanilla extract" , "2 large eggs" , "2 1/4 cups all-purpose flour" , "1 teaspoon baking soda" , "1 teaspoon salt" , "2 cups chocolate chips" ] }, ... ] Providing a schema in a text prompt Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended , because it might produce lower quality output, and because the model is not constrained to follow the schema. Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. \ No newline at end of file diff --git a/docstore/a769b08c-d625-4fde-9362-97e3e3d6cd5e b/docstore/a769b08c-d625-4fde-9362-97e3e3d6cd5e new file mode 100644 index 0000000000000000000000000000000000000000..ebd105342549a255faf01232b49ba70d20b000ef --- /dev/null +++ b/docstore/a769b08c-d625-4fde-9362-97e3e3d6cd5e @@ -0,0 +1 @@ +the next tier. Why use the paid tier? When you enable billing and use the paid tier, you benefit from higher rate limits , and your prompts and responses aren't used to improve Google products. For more information on data use for paid services, see the terms of service . Cloud Billing The Gemini API uses Cloud Billing for billing services. To use the paid tier, you must set up Cloud Billing on your cloud project. After you've enabled Cloud Billing, you can use Cloud Billing tools to track spending, understand costs, make payments, and access Cloud Billing support. Enable billing You can enable Cloud Billing starting from Google AI Studio: Open Google AI Studio . In the bottom of the left sidebar, select Settings > Plan information . Click Set up Billing for your chosen project to enable Cloud Billing. 
Monitor usage After you enable Cloud Billing, you can monitor your usage of the Gemini API in the Google Cloud console . The service name for the API is generativelanguage.googleapis.com , and in the console the Gemini API is also referred to as the Generative Language API . To learn more, see the Google Cloud documentation on monitoring API usage . Frequently asked questions This section provides answers to frequently asked questions. What am I billed for? Gemini API pricing is based on the following: Input token count Output token count Cached token count Cached token storage duration For pricing information, see the pricing page . Where can I view my quota? You can view your quota and system limits in the Google Cloud console . How do I request more quota? To request more quota, follow the instructions at How to request an upgrade . Can I use the Gemini API for free in EEA (including EU), the UK, and CH? Yes, we make the free tier and paid tier available in many regions . If I set up billing with the Gemini API, will I be charged for my Google AI Studio usage? No, Google AI Studio usage remains free of charge regardless of if you set up billing across all supported \ No newline at end of file diff --git a/docstore/a7880b2b-c4ba-44dc-821b-1153e62494a6 b/docstore/a7880b2b-c4ba-44dc-821b-1153e62494a6 new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/a7880b2b-c4ba-44dc-821b-1153e62494a6 @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . 
name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/a78dee29-dae8-4738-abf0-1582dd8d48d2 b/docstore/a78dee29-dae8-4738-abf0-1582dd8d48d2 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/a78dee29-dae8-4738-abf0-1582dd8d48d2 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/a7961f12-209d-4b56-be96-5cb4386b2280 b/docstore/a7961f12-209d-4b56-be96-5cb4386b2280 new file mode 100644 index 0000000000000000000000000000000000000000..ae967e564d1a537297ba481c42281d32203b30fe --- /dev/null +++ b/docstore/a7961f12-209d-4b56-be96-5cb4386b2280 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding#youtube Title: Video understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a797289a-699a-4023-8ff3-fd6d10038e1b b/docstore/a797289a-699a-4023-8ff3-fd6d10038e1b new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/a797289a-699a-4023-8ff3-fd6d10038e1b @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/a79f4f56-c69c-4c41-8ad7-a2a7403bcbf4 b/docstore/a79f4f56-c69c-4c41-8ad7-a2a7403bcbf4 new file mode 100644 index 0000000000000000000000000000000000000000..1785bc52e69ea8511733e76440059ca251dc2784 --- /dev/null +++ b/docstore/a79f4f56-c69c-4c41-8ad7-a2a7403bcbf4 @@ -0,0 +1 @@ +DISPLAY_NAME_2 = "Gemini_1.5_paper" PROMPT = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." 
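# upload_pdf downloads a PDF with wget, detects its MIME type and size, performs
# the File API resumable upload (a "start" request to obtain the upload URL from
# the response headers, then an "upload, finalize" request with the file bytes),
# extracts file.uri from the returned JSON with jq, removes the local copy, and
# echoes the URI for the caller. Note: the caller captures the function's entire
# stdout via command substitution, so the informational echo lines are included
# along with the final URI.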
# Function to download and upload a PDF upload_pdf () { local doc_url = " $1 " local display_name = " $2 " # Download the PDF wget -O " ${ display_name } .pdf" " ${ doc_url } " local MIME_TYPE = $( file -b --mime-type " ${ display_name } .pdf" ) local NUM_BYTES = $( wc -c < " ${ display_name } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " local tmp_header_file = upload-header.tmp # Initial resumable request curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ display_name } '}}" 2 > /dev/null local upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the PDF curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ display_name } .pdf" 2 > /dev/null > "file_info_ ${ display_name } .json" local file_uri = $( jq ".file.uri" "file_info_ ${ display_name } .json" ) echo "file_uri for ${ display_name } : ${ file_uri } " # Clean up the downloaded PDF rm " ${ display_name } .pdf" echo " ${ file_uri } " } # Upload the first PDF file_uri_1 = $( upload_pdf " ${ DOC_URL_1 } " " ${ DISPLAY_NAME_1 } " ) # Upload the second PDF file_uri_2 = $( upload_pdf " ${ DOC_URL_2 } " " ${ DISPLAY_NAME_2 } " ) # Now generate content using both files curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X \ No newline at end of file diff --git a/docstore/a7a586c0-c522-4416-ad3a-de953b860473 b/docstore/a7a586c0-c522-4416-ad3a-de953b860473 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/a7a586c0-c522-4416-ad3a-de953b860473 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/a7a5d58d-0304-45ce-8f4c-b4fce5c40e74 b/docstore/a7a5d58d-0304-45ce-8f4c-b4fce5c40e74 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/a7a5d58d-0304-45ce-8f4c-b4fce5c40e74 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/a7c50068-b946-49b4-aff2-fcd6dbb6a157 b/docstore/a7c50068-b946-49b4-aff2-fcd6dbb6a157 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/a7c50068-b946-49b4-aff2-fcd6dbb6a157 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. 
We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/a7d17be4-7f33-4db8-8724-8407030d8d2f b/docstore/a7d17be4-7f33-4db8-8724-8407030d8d2f new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/a7d17be4-7f33-4db8-8724-8407030d8d2f @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . 
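For example, here is a minimal sketch of preparing input audio and labeling it with its sample rate, assuming a 16-bit PCM WAV file and the google-genai Python SDK; the file name and the final send call are illustrative.

Python

import wave

from google.genai import types

# Read raw little-endian 16-bit PCM frames from a WAV file.
with wave.open("input.wav", "rb") as wf:
    assert wf.getsampwidth() == 2      # 16-bit samples, as the Live API expects
    rate = wf.getframerate()           # e.g. 16000 for natively supported input
    pcm_bytes = wf.readframes(wf.getnframes())

# The MIME type conveys the sample rate of the raw PCM data.
audio_blob = types.Blob(data=pcm_bytes, mime_type=f"audio/pcm;rate={rate}")

# Within an active Live API session, the blob would then be streamed, e.g.:
#   await session.send_realtime_input(audio=audio_blob)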
Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/a7ea02fc-7d7e-4913-9e7b-56f8c298ec26 b/docstore/a7ea02fc-7d7e-4913-9e7b-56f8c298ec26 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/a7ea02fc-7d7e-4913-9e7b-56f8c298ec26 @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/a7f41a55-5fc7-4816-9697-6d91a1ef49d7 b/docstore/a7f41a55-5fc7-4816-9697-6d91a1ef49d7 new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/a7f41a55-5fc7-4816-9697-6d91a1ef49d7 @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/a7f59b79-5c10-4ad0-a178-fcfa07db297b b/docstore/a7f59b79-5c10-4ad0-a178-fcfa07db297b new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/a7f59b79-5c10-4ad0-a178-fcfa07db297b @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/a7fa4cbd-889a-4c1b-b09e-1bb121a5b7d8 b/docstore/a7fa4cbd-889a-4c1b-b09e-1bb121a5b7d8 new file mode 100644 index 0000000000000000000000000000000000000000..b49656489ada29d3fcb1875fdf6f7f73e44c8709 --- /dev/null +++ b/docstore/a7fa4cbd-889a-4c1b-b09e-1bb121a5b7d8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#limitations Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a806cc39-6669-4739-97e5-9026dd505b37 b/docstore/a806cc39-6669-4739-97e5-9026dd505b37 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/a806cc39-6669-4739-97e5-9026dd505b37 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/a80d8ed2-a8a9-4f59-993b-6c08675049b0 b/docstore/a80d8ed2-a8a9-4f59-993b-6c08675049b0 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/a80d8ed2-a8a9-4f59-993b-6c08675049b0 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. 
The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/a80da14a-ffab-4278-872f-f03b14c5248d b/docstore/a80da14a-ffab-4278-872f-f03b14c5248d new file mode 100644 index 0000000000000000000000000000000000000000..aa68cf0548cfb2585cc033502540f8f159dbfde7 --- /dev/null +++ b/docstore/a80da14a-ffab-4278-872f-f03b14c5248d @@ -0,0 +1 @@ +Gemini API has built-in protections against core harms, such as content that endangers child safety. These types of harm are always blocked and cannot be adjusted. Content safety filtering level The Gemini API categorizes the probability level of content being unsafe as HIGH , MEDIUM , LOW , or NEGLIGIBLE . The Gemini API blocks content based on the probability of content being unsafe and not the severity. This is important to consider because some content can have low probability of being unsafe even though the severity of harm could still be high. For example, comparing the sentences: The robot punched me. The robot slashed me up. The first sentence might result in a higher probability of being unsafe, but you might consider the second sentence to be a higher severity in terms of violence. Given this, it is important that you carefully test and consider what the appropriate level of blocking is needed to support your key use cases while minimizing harm to end users. Safety filtering per request You can adjust the safety settings for each request you make to the API. When you make a request, the content is analyzed and assigned a safety rating. The safety rating includes the category and the probability of the harm classification. For example, if the content was blocked due to the harassment category having a high probability, the safety rating returned would have category equal to HARASSMENT and harm probability set to HIGH . By default, safety settings block content (including prompts) with medium or higher probability of being unsafe across any filter. This baseline safety is designed to work for most use cases, so you should only adjust your safety settings if it's consistently required for your application. The following table describes the block settings you can adjust for each category. For example, if you set the block setting to Block few for the Hate speech category, everything that has a high probability of being hate speech content is blocked. 
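To complement the per-request thresholds described above, you can also inspect the safety information the API attaches to a response. The sketch below uses the google-genai Python SDK; the prompt_feedback.block_reason and candidate safety_ratings fields follow the safety documentation, but treat the exact field names as assumptions rather than a definitive reference.

from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Tell me something about robots.",
)

# If the prompt itself was blocked, prompt_feedback carries the reason.
if response.prompt_feedback and response.prompt_feedback.block_reason:
    print("Prompt blocked:", response.prompt_feedback.block_reason)
else:
    # Each candidate carries per-category ratings (category and probability),
    # which you can log or use to tune your block thresholds.
    for rating in response.candidates[0].safety_ratings or []:
        print(rating.category, rating.probability)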
But \ No newline at end of file diff --git a/docstore/a8289f76-2428-434e-b717-d3c7e9ddba22 b/docstore/a8289f76-2428-434e-b717-d3c7e9ddba22 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/a8289f76-2428-434e-b717-d3c7e9ddba22 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/a834742a-ce34-4a5e-a3fd-669bcd7cd9bd b/docstore/a834742a-ce34-4a5e-a3fd-669bcd7cd9bd new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/a834742a-ce34-4a5e-a3fd-669bcd7cd9bd @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . 
\ No newline at end of file diff --git a/docstore/a866444c-a916-4977-97d7-fa1fc1cf22c5 b/docstore/a866444c-a916-4977-97d7-fa1fc1cf22c5 new file mode 100644 index 0000000000000000000000000000000000000000..5c2e3d449ab3ef2facf74289f181cd7d6659749f --- /dev/null +++ b/docstore/a866444c-a916-4977-97d7-fa1fc1cf22c5 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#live-api Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a874b0e5-0adb-4986-a173-d7f6700181a6 b/docstore/a874b0e5-0adb-4986-a173-d7f6700181a6 new file mode 100644 index 0000000000000000000000000000000000000000..c6aa65d262d73786cc7d106425f2574a0a896d12 --- /dev/null +++ b/docstore/a874b0e5-0adb-4986-a173-d7f6700181a6 @@ -0,0 +1 @@ +the modal, you can use the sliders to adjust the content filtering level per safety category: Note: If you set any of the category filters to Block none , Google AI Studio will display a reminder about the Gemini API's Terms of Service with respect to safety settings. When you send a request (for example, by asking the model a question), a warning No Content message appears if the request's content is blocked. To see more details, hold the pointer over the No Content text and click warning Safety . Gemini API SDKs The following code snippet shows how to set safety settings in your GenerateContent call. This sets the thresholds for the harassment ( HARM_CATEGORY_HARASSMENT ) and hate speech ( HARM_CATEGORY_HATE_SPEECH ) categories. For example, setting these categories to BLOCK_LOW_AND_ABOVE blocks any content that has a low or higher probability of being harassment or hate speech. To understand the threshold settings, see Safety filtering per request . Python from google import genai from google.genai import types import PIL.Image img = PIL . Image . open ( "cookies.jpg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ 'Do these look store-bought or homemade?' , img ], config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = types . HarmCategory . HARM_CATEGORY_HATE_SPEECH , threshold = types . HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , ), ] ) ) print ( response . text ) Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SafetySettings : [] * genai . SafetySetting { { Category : "HARM_CATEGORY_HATE_SPEECH" , Threshold : "BLOCK_LOW_AND_ABOVE" , }, }, } response , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Some potentially \ No newline at end of file diff --git a/docstore/a8aee5af-cd76-45e3-aaee-461c2e4d1d76 b/docstore/a8aee5af-cd76-45e3-aaee-461c2e4d1d76 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/a8aee5af-cd76-45e3-aaee-461c2e4d1d76 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/a8b1329d-1f69-4be2-8b3a-148d7239fdcd b/docstore/a8b1329d-1f69-4be2-8b3a-148d7239fdcd new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/a8b1329d-1f69-4be2-8b3a-148d7239fdcd @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. 
(gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/a8cb7ad6-957a-41b2-ad2e-cc3f8551560b b/docstore/a8cb7ad6-957a-41b2-ad2e-cc3f8551560b new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/a8cb7ad6-957a-41b2-ad2e-cc3f8551560b @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/a8d3154c-a6fd-4a26-adb6-ecf0aae080d8 b/docstore/a8d3154c-a6fd-4a26-adb6-ecf0aae080d8 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/a8d3154c-a6fd-4a26-adb6-ecf0aae080d8 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! 
Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/a8dc8311-57fb-409e-8c7d-0de98e952c07 b/docstore/a8dc8311-57fb-409e-8c7d-0de98e952c07 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/a8dc8311-57fb-409e-8c7d-0de98e952c07 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . 
If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/a8dd1859-4eb1-4b3b-93ef-6ac73014a050 b/docstore/a8dd1859-4eb1-4b3b-93ef-6ac73014a050 new file mode 100644 index 0000000000000000000000000000000000000000..e4d61b25460e3f7a88247fee03ad54970e9dbee5 --- /dev/null +++ b/docstore/a8dd1859-4eb1-4b3b-93ef-6ac73014a050 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#supported-languages Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/a948edd6-0ad9-44cf-9c34-b9f9fa9cb803 b/docstore/a948edd6-0ad9-44cf-9c34-b9f9fa9cb803 new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/a948edd6-0ad9-44cf-9c34-b9f9fa9cb803 @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . 
connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/a959b0b3-3981-40c8-93c1-0504187fb0e1 b/docstore/a959b0b3-3981-40c8-93c1-0504187fb0e1 new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/a959b0b3-3981-40c8-93c1-0504187fb0e1 @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/a95f6877-eceb-4a5b-b08a-7fd0243e4299 b/docstore/a95f6877-eceb-4a5b-b08a-7fd0243e4299 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/a95f6877-eceb-4a5b-b08a-7fd0243e4299 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/a96681ce-b034-41a2-baae-54ed699bfc85 b/docstore/a96681ce-b034-41a2-baae-54ed699bfc85 new file mode 100644 index 0000000000000000000000000000000000000000..0abe7c770a1c93708b98ee8b0a34df5d347d5c9d --- /dev/null +++ b/docstore/a96681ce-b034-41a2-baae-54ed699bfc85 @@ -0,0 +1 @@ +candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about \ No newline at end of file diff --git a/docstore/a971589c-37da-42ab-8d85-74310adf7dd7 b/docstore/a971589c-37da-42ab-8d85-74310adf7dd7 new file mode 100644 index 0000000000000000000000000000000000000000..40564cc3a339b41e3f9c5a2f24a7d0082d31abf9 --- /dev/null +++ b/docstore/a971589c-37da-42ab-8d85-74310adf7dd7 @@ -0,0 +1 @@ +response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . 
resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # \ No newline at end of file diff --git a/docstore/a97a606e-17e1-42ac-b5b4-8913bd645df0 b/docstore/a97a606e-17e1-42ac-b5b4-8913bd645df0 new file mode 100644 index 0000000000000000000000000000000000000000..7a617ceacc5e968d9729ffe6ff8f1e15b90d626d --- /dev/null +++ b/docstore/a97a606e-17e1-42ac-b5b4-8913bd645df0 @@ -0,0 +1 @@ +multiple attempts yield the best results. Keep it short : Limit text to 25 characters or less for optimal generation. Multiple phrases : Experiment with two or three distinct phrases to provide additional information. Avoid exceeding three phrases for cleaner compositions. Prompt: A poster with the text "Summerland" in bold font as a title, underneath this text is the slogan "Summer never felt so good" Guide Placement : While Imagen can attempt to position text as directed, expect occasional variations. This feature is continually improving. Inspire font style : Specify a general font style to subtly influence Imagen's choices. Don't rely on precise font replication, but expect creative interpretations. Font size : Specify a font size or a general indication of size (for example, small , medium , large ) to influence the font size generation. Prompt parameterization To better control output results, you might find it helpful to parameterize the inputs into Imagen. For example, suppose you want your customers to be able to generate logos for their business, and you want to make sure logos are always generated on a solid color background. You also want to limit the options that the client can select from a menu. In this example, you can create a parameterized prompt similar to the following: A {logo_style} logo for a {company_area} company on a solid color background. Include the text {company_name} . In your custom user interface, the customer can input the parameters using a menu, and their chosen value populates the prompt Imagen receives. For example: Prompt: A minimalist logo for a health care company on a solid color background. Include the text Journey . Prompt: A modern logo for a software company on a solid color background. Include the text Silo . Prompt: A traditional logo for a baking company on a solid color background. Include the text Seed . Advanced prompt writing techniques Use the following examples to create more specific prompts based on attributes \ No newline at end of file diff --git a/docstore/a9805373-b9a4-435c-8b64-d8d5acee637c b/docstore/a9805373-b9a4-435c-8b64-d8d5acee637c new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/a9805373-b9a4-435c-8b64-d8d5acee637c @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
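To tie the prompt parameterization discussion above to an actual image request, here is a small sketch using the google-genai Python SDK. The generate_images call and the imagen-3.0-generate-002 model code follow the model tables in this document, but the menu values and config fields shown are illustrative assumptions, not the document's own sample.

from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical menu selections from the custom UI described above.
logo_style = "minimalist"
company_area = "health care"
company_name = "Journey"

# The parameterized template is plain string substitution on the client side.
prompt = (
    f"A {logo_style} logo for a {company_area} company on a solid color "
    f"background. Include the text {company_name}."
)

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt=prompt,
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Save the generated image bytes to disk.
with open("logo.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)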
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/a9839764-f850-4393-87fe-af28801f4e3d b/docstore/a9839764-f850-4393-87fe-af28801f4e3d new file mode 100644 index 0000000000000000000000000000000000000000..88c00154929a60afbed4badff3d232f79f564557 --- /dev/null +++ b/docstore/a9839764-f850-4393-87fe-af28801f4e3d @@ -0,0 +1 @@ +can take many forms including researching state of the art studies in your app domain, observing how people are using similar apps, or running a user study, survey, or conducting informal interviews with potential users. Advanced tips Speak with a diverse mix of prospective users within your target population about your application and its intended purpose so as to get a wider perspective on potential risks and to adjust diversity criteria as needed. The AI Risk Management Framework released by the U.S. government's National Institute of Standards and Technology (NIST) provides more detailed guidance and additional learning resources for AI risk management. DeepMind's publication on the ethical and social risks of harm from language models describes in detail the ways that language model applications can cause harm. Consider adjustments to mitigate safety risks Now that you have an understanding of the risks, you can decide how to mitigate them. Determining which risks to prioritize and how much you should do to try to prevent them is a critical decision, similar to triaging bugs in a software project. Once you've determined priorities, you can start thinking about the types of mitigations that would be most appropriate. Often simple changes can make a difference and reduce risks. For example, when designing an application consider: Tuning the model output to better reflect what is acceptable in your application context. 
Tuning can make the output of the model more predictable and consistent and therefore can help mitigate certain risks. Providing an input method that facilities safer outputs. The exact input you give to an LLM can make a difference in the quality of the output. Experimenting with input prompts to find what works most safely in your use-case is well worth the effort, as you can then provide a UX that facilitates it. For example, you could restrict users to choose only from a drop-down list of input prompts, or offer pop-up suggestions with \ No newline at end of file diff --git a/docstore/a99527d8-a232-427b-929e-df40f2451025 b/docstore/a99527d8-a232-427b-929e-df40f2451025 new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/a99527d8-a232-427b-929e-df40f2451025 @@ -0,0 +1 @@ +a picture of me. Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me. Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/a9b5ad2d-1aa8-43cd-adf2-6dd19087d8d2 b/docstore/a9b5ad2d-1aa8-43cd-adf2-6dd19087d8d2 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/a9b5ad2d-1aa8-43cd-adf2-6dd19087d8d2 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/a9f3835e-4f3c-411c-b656-9fd7b807b7e3 b/docstore/a9f3835e-4f3c-411c-b656-9fd7b807b7e3 new file mode 100644 index 0000000000000000000000000000000000000000..ddb7a9245d74e27120ebf722d781e6ffdbe95888 --- /dev/null +++ b/docstore/a9f3835e-4f3c-411c-b656-9fd7b807b7e3 @@ -0,0 +1 @@ +"gemini-1.5-flash" ) imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about this instrument" ), genai . ImageData ( "jpeg" , imgData )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python Many of the same convenience features exist in the new SDK. For example, PIL.Image objects are automatically converted. from google import genai from PIL import Image client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Tell me a story based on this image' , Image . open ( image_path ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const organ = await ai . files . upload ({ file : "path/to/organ.jpg" , }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : [ createUserContent ([ "Tell me a story based on this image" , createPartFromUri ( organ . uri , organ . mimeType ) ]), ], }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { { Text : "Tell me a story based on this image" }, { InlineData : & genai . 
Blob { Data : imgData , MIMEType : "image/jpeg" }}, } contents := [] * genai . Content { { Parts : parts }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Streaming Before Python import google.generativeai as genai response = model . generate_content ( "Write a cute story about cats." , stream = True ) for chunk in response : print ( chunk . text ) \ No newline at end of file diff --git a/docstore/a9ffe737-1389-4c96-b7d7-6ca56ff483ca b/docstore/a9ffe737-1389-4c96-b7d7-6ca56ff483ca new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/a9ffe737-1389-4c96-b7d7-6ca56ff483ca @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/aa188b02-c3df-4fe3-a031-f16bae08452b b/docstore/aa188b02-c3df-4fe3-a031-f16bae08452b new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/aa188b02-c3df-4fe3-a031-f16bae08452b @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/aa1eccd5-8de6-421a-aea6-5998bb56bffc b/docstore/aa1eccd5-8de6-421a-aea6-5998bb56bffc new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/aa1eccd5-8de6-421a-aea6-5998bb56bffc @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . 
GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/aa27954c-43d6-4365-a744-84429b1947c4 b/docstore/aa27954c-43d6-4365-a744-84429b1947c4 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/aa27954c-43d6-4365-a744-84429b1947c4 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. 
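As a companion to the statement above that compositional function calling is native to the Live API, here is a minimal sketch of opening a Live session with function declarations as tools using the google-genai Python SDK. The connect/receive pattern and the tool_call field mirror the Live API tool-use documentation, but the declarations, model name, and handling shown are illustrative assumptions; the document's own light-control sample continues below.

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
# Live-capable model name taken from the model tables in this document.
model = "gemini-live-2.5-flash-preview"

# Declarations-only tools: the server emits tool calls, the client executes them.
turn_on_the_lights = {"name": "turn_on_the_lights"}
turn_off_the_lights = {"name": "turn_off_the_lights"}

config = {
    "response_modalities": ["TEXT"],
    "tools": [{"function_declarations": [turn_on_the_lights, turn_off_the_lights]}],
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Turn on the lights")])
        )
        async for response in session.receive():
            # Requested calls arrive on tool_call; echo a success result back.
            if response.tool_call:
                function_responses = [
                    types.FunctionResponse(id=fc.id, name=fc.name, response={"result": "ok"})
                    for fc in response.tool_call.function_calls
                ]
                await session.send_tool_response(function_responses=function_responses)
            elif response.text:
                print(response.text)

if __name__ == "__main__":
    asyncio.run(main())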
Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/aa36ccba-e69f-415c-a520-b688deda097b b/docstore/aa36ccba-e69f-415c-a520-b688deda097b new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/aa36ccba-e69f-415c-a520-b688deda097b @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/aa4f2a0d-0343-4f59-a2ff-ffa84cbb7bdf b/docstore/aa4f2a0d-0343-4f59-a2ff-ffa84cbb7bdf new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/aa4f2a0d-0343-4f59-a2ff-ffa84cbb7bdf @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/aa5229a9-e3ec-4ce1-8faf-728bad818377 b/docstore/aa5229a9-e3ec-4ce1-8faf-728bad818377 new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/aa5229a9-e3ec-4ce1-8faf-728bad818377 @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
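The JavaScript snippet above reassembles the Live API's 16-bit, 24 kHz mono PCM output into a WAV file with the wavefile package. In Python the standard-library wave module covers the same job; here is a minimal sketch, where pcm_chunks stands in for the response.data byte chunks collected from the session:
Python
import wave

def save_pcm_chunks_as_wav(pcm_chunks, filename="audio.wav"):
    # The Live API streams raw 16-bit PCM at 24 kHz, single channel.
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)       # mono
        wf.setsampwidth(2)       # 16-bit samples
        wf.setframerate(24000)   # 24 kHz output
        for chunk in pcm_chunks:
            wf.writeframes(chunk)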
\ No newline at end of file diff --git a/docstore/aa772e49-95ca-4318-b596-ca39b5fa603a b/docstore/aa772e49-95ca-4318-b596-ca39b5fa603a new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/aa772e49-95ca-4318-b596-ca39b5fa603a @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/aa7b3eb1-47e6-4d75-b5e5-46257dae7e54 b/docstore/aa7b3eb1-47e6-4d75-b5e5-46257dae7e54 new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/aa7b3eb1-47e6-4d75-b5e5-46257dae7e54 @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. 
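Since each JSONL line is simply a GenerateContentRequest, those per-request extras are added as additional keys on the request object. A minimal sketch of appending one more request that carries a system instruction and a generation config; the system_instruction field name is assumed by analogy with the snake_case generation_config key used in the example above:
Python
import json

request_with_extras = {
    "key": "request-3",
    "request": {
        # Assumed field name, following the snake_case style of generation_config.
        "system_instruction": {"parts": [{"text": "You are a concise assistant."}]},
        "contents": [{"parts": [{"text": "Summarize the water cycle."}]}],
        "generation_config": {"temperature": 0.2},
    },
}

with open("my-batch-requests.jsonl", "a") as f:
    f.write(json.dumps(request_with_extras) + "\n")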
You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } \ No newline at end of file diff --git a/docstore/aa8830e6-bfb3-41f5-86f1-88246e72c28d b/docstore/aa8830e6-bfb3-41f5-86f1-88246e72c28d new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/aa8830e6-bfb3-41f5-86f1-88246e72c28d @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . 
\ No newline at end of file diff --git a/docstore/aa9a274c-68bd-40d8-abd6-9c13aba1838d b/docstore/aa9a274c-68bd-40d8-abd6-9c13aba1838d new file mode 100644 index 0000000000000000000000000000000000000000..10d595bd2c735f8912abb00e69220b9ae90d3d23 --- /dev/null +++ b/docstore/aa9a274c-68bd-40d8-abd6-9c13aba1838d @@ -0,0 +1 @@ +Audio understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Audio understanding Gemini can analyze and understand audio input, enabling use cases like the following: Describe, summarize, or answer questions about audio content. Provide a transcription of the audio. Analyze specific segments of the audio. This guide shows you how to use the Gemini API to generate a text response to audio input. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Input audio You can provide audio data to Gemini in the following ways: Upload an audio file before making a request to generateContent . Pass inline audio data with the request to generateContent . Upload an audio file You can use the Files API to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads an audio file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mp3" }, }); const response = await ai . models . generateContent ({ \ No newline at end of file diff --git a/docstore/aa9d7ccf-f75d-434d-b776-11fa0982c6ef b/docstore/aa9d7ccf-f75d-434d-b776-11fa0982c6ef new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/aa9d7ccf-f75d-434d-b776-11fa0982c6ef @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/aaa25c1e-dcdc-4859-81d4-82ce6203782a b/docstore/aaa25c1e-dcdc-4859-81d4-82ce6203782a new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/aaa25c1e-dcdc-4859-81d4-82ce6203782a @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/aaa85c90-5ad4-4776-8512-6bd972b1a90a b/docstore/aaa85c90-5ad4-4776-8512-6bd972b1a90a new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/aaa85c90-5ad4-4776-8512-6bd972b1a90a @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/aaab7be7-8677-4997-990c-f6cfb7d14364 b/docstore/aaab7be7-8677-4997-990c-f6cfb7d14364 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/aaab7be7-8677-4997-990c-f6cfb7d14364 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". 
Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/aab99d50-20c7-425d-8daf-60594bff4c0f b/docstore/aab99d50-20c7-425d-8daf-60594bff4c0f new file mode 100644 index 0000000000000000000000000000000000000000..69f7399c35aaaad68e1bd1a996c44353577b3a79 --- /dev/null +++ b/docstore/aab99d50-20c7-425d-8daf-60594bff4c0f @@ -0,0 +1 @@ +the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . 
TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await \ No newline at end of file diff --git a/docstore/aacdd2bd-8a4e-405e-b735-1678d0657258 b/docstore/aacdd2bd-8a4e-405e-b735-1678d0657258 new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5762f2e47222d475421a2613ce0f732fa260 --- /dev/null +++ b/docstore/aacdd2bd-8a4e-405e-b735-1678d0657258 @@ -0,0 +1 @@ +in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at \ No newline at end of file diff --git a/docstore/aadde207-b595-4b6d-b44a-e222b205ca9c b/docstore/aadde207-b595-4b6d-b44a-e222b205ca9c new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/aadde207-b595-4b6d-b44a-e222b205ca9c @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . 
connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/aaf70cf6-0cc8-4ee9-a086-79b08bdb62c7 b/docstore/aaf70cf6-0cc8-4ee9-a086-79b08bdb62c7 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/aaf70cf6-0cc8-4ee9-a086-79b08bdb62c7 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/ab006f09-616a-48b6-9807-1dd7256b476c b/docstore/ab006f09-616a-48b6-9807-1dd7256b476c new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/ab006f09-616a-48b6-9807-1dd7256b476c @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/ab02792f-07d6-4ab2-882f-52fe57533471 b/docstore/ab02792f-07d6-4ab2-882f-52fe57533471 new file mode 100644 index 0000000000000000000000000000000000000000..aa68cf0548cfb2585cc033502540f8f159dbfde7 --- /dev/null +++ b/docstore/ab02792f-07d6-4ab2-882f-52fe57533471 @@ -0,0 +1 @@ +Gemini API has built-in protections against core harms, such as content that endangers child safety. 
These types of harm are always blocked and cannot be adjusted. Content safety filtering level The Gemini API categorizes the probability level of content being unsafe as HIGH , MEDIUM , LOW , or NEGLIGIBLE . The Gemini API blocks content based on the probability of content being unsafe and not the severity. This is important to consider because some content can have low probability of being unsafe even though the severity of harm could still be high. For example, comparing the sentences: The robot punched me. The robot slashed me up. The first sentence might result in a higher probability of being unsafe, but you might consider the second sentence to be a higher severity in terms of violence. Given this, it is important that you carefully test and consider what the appropriate level of blocking is needed to support your key use cases while minimizing harm to end users. Safety filtering per request You can adjust the safety settings for each request you make to the API. When you make a request, the content is analyzed and assigned a safety rating. The safety rating includes the category and the probability of the harm classification. For example, if the content was blocked due to the harassment category having a high probability, the safety rating returned would have category equal to HARASSMENT and harm probability set to HIGH . By default, safety settings block content (including prompts) with medium or higher probability of being unsafe across any filter. This baseline safety is designed to work for most use cases, so you should only adjust your safety settings if it's consistently required for your application. The following table describes the block settings you can adjust for each category. For example, if you set the block setting to Block few for the Hate speech category, everything that has a high probability of being hate speech content is blocked. But \ No newline at end of file diff --git a/docstore/ab04819b-747e-4f98-8b4d-90589d959e36 b/docstore/ab04819b-747e-4f98-8b4d-90589d959e36 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/ab04819b-747e-4f98-8b4d-90589d959e36 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/ab2546d9-e3aa-483a-ad0f-110407e4e792 b/docstore/ab2546d9-e3aa-483a-ad0f-110407e4e792 new file mode 100644 index 0000000000000000000000000000000000000000..3ecdc47bdfdb376a1d8226f76a3e20fc1fff4015 --- /dev/null +++ b/docstore/ab2546d9-e3aa-483a-ad0f-110407e4e792 @@ -0,0 +1 @@ +"createTunedModel" : print ( m . name ) break # create tuning model training_dataset = types . TuningDataset ( examples = [ types . TuningExample ( text_input = f 'input { i } ' , output = f 'output { i } ' , ) for i in range ( 5 ) ], ) tuning_job = client . tunings . tune ( base_model = 'models/gemini-1.5-flash-001-tuning' , training_dataset = training_dataset , config = types . CreateTuningJobConfig ( epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , tuned_model_display_name = "test tuned model" ) ) # generate content with the tuned model response = client . models . generate_content ( model = tuning_job . tuned_model . model , contents = '55' , ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/ab28176a-fc12-42c6-a33b-98b434313dde b/docstore/ab28176a-fc12-42c6-a33b-98b434313dde new file mode 100644 index 0000000000000000000000000000000000000000..5c08e584262d5f95532435a85f2ee4b9a401ebd8 --- /dev/null +++ b/docstore/ab28176a-fc12-42c6-a33b-98b434313dde @@ -0,0 +1 @@ +'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } What's next Now that you made your first API request, you might want to explore the following guides that show Gemini in action: Thinking Text generation Vision Long context Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/ab4edd34-66ca-43e2-a181-f71019902b04 b/docstore/ab4edd34-66ca-43e2-a181-f71019902b04 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/ab4edd34-66ca-43e2-a181-f71019902b04 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
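The safety filtering passage earlier in this section describes adjusting per-request block settings but shows no code. A minimal Python sketch of one such request, assuming the GenerateContentConfig surface used elsewhere in these docs and the standard category and threshold names:
Python
from google import genai
from google.genai import types

client = genai.Client()

# Only block hate-speech content rated as high probability; the other
# categories keep their default (medium-and-above) blocking behaviour.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a short story about a robot learning to paint.",
    config=types.GenerateContentConfig(
        safety_settings=[
            types.SafetySetting(
                category="HARM_CATEGORY_HATE_SPEECH",
                threshold="BLOCK_ONLY_HIGH",
            ),
        ],
    ),
)
print(response.text)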
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/ab83669c-4778-4a48-a902-b64052157311 b/docstore/ab83669c-4778-4a48-a902-b64052157311 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/ab83669c-4778-4a48-a902-b64052157311 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. 
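A minimal Python sketch of requesting an embedding from text-embedding-004, assuming the embed_content surface of the google-genai client used elsewhere in these docs:
Python
from google import genai

client = genai.Client()

# Embed a short passage; text-embedding-004 returns 768-dimensional vectors.
result = client.models.embed_content(
    model="text-embedding-004",
    contents="Text embeddings measure how related two strings are.",
)
print(len(result.embeddings[0].values))  # expected: 768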
Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/ab83a002-ff09-4409-b8bc-bf4b6eb68687 b/docstore/ab83a002-ff09-4409-b8bc-bf4b6eb68687 new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/ab83a002-ff09-4409-b8bc-bf4b6eb68687 @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/ab87f37d-da36-4283-8449-4a5afc121d27 b/docstore/ab87f37d-da36-4283-8449-4a5afc121d27 new file mode 100644 index 0000000000000000000000000000000000000000..2bc9ee1b64943d2fc9ee4b66d281a35e0e278a02 --- /dev/null +++ b/docstore/ab87f37d-da36-4283-8449-4a5afc121d27 @@ -0,0 +1 @@ +Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. 
Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( \ No newline at end of file diff --git a/docstore/ab97deea-27ef-415e-ab7f-e0e0835e2efa b/docstore/ab97deea-27ef-415e-ab7f-e0e0835e2efa new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/ab97deea-27ef-415e-ab7f-e0e0835e2efa @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/abab8598-8c24-43d7-89ab-4b1e7ef8bc85 b/docstore/abab8598-8c24-43d7-89ab-4b1e7ef8bc85 new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/abab8598-8c24-43d7-89ab-4b1e7ef8bc85 @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. 
Start by \ No newline at end of file diff --git a/docstore/abac0c80-ea27-4049-a129-d180c64b98d3 b/docstore/abac0c80-ea27-4049-a129-d180c64b98d3 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/abac0c80-ea27-4049-a129-d180c64b98d3 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/abc62528-f62a-4693-8f8d-cbd24f0192fd b/docstore/abc62528-f62a-4693-8f8d-cbd24f0192fd new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/abc62528-f62a-4693-8f8d-cbd24f0192fd @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
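To illustrate the function calling best practices listed above (descriptive naming, clear descriptions, strong typing with an enum), here is a hedged Python sketch of a hypothetical set_light_color declaration passed as a tool. The declaration itself is invented for illustration, while the Tool and GenerateContentConfig surface follows the patterns used elsewhere in these docs.
Python
from google import genai
from google.genai import types

# Hypothetical declaration: descriptive name, clear description, and strongly
# typed parameters, with an enum restricting the valid colors.
set_light_color_declaration = {
    "name": "set_light_color",
    "description": "Sets the color and brightness of the living room smart light.",
    "parameters": {
        "type": "object",
        "properties": {
            "color": {
                "type": "string",
                "enum": ["red", "green", "blue", "warm_white"],
                "description": "The color to set the light to.",
            },
            "brightness": {
                "type": "integer",
                "description": "Brightness from 0 (off) to 100 (full).",
            },
        },
        "required": ["color"],
    },
}

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Make the living room light blue at half brightness.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[set_light_color_declaration])],
    ),
)
print(response.candidates[0].content.parts[0].function_call)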
\ No newline at end of file diff --git a/docstore/abd158dd-5498-4435-acb7-01521fd8a5b7 b/docstore/abd158dd-5498-4435-acb7-01521fd8a5b7 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/abd158dd-5498-4435-acb7-01521fd8a5b7 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/abd53353-faa7-47c9-a974-94d954ce403d b/docstore/abd53353-faa7-47c9-a974-94d954ce403d new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/abd53353-faa7-47c9-a974-94d954ce403d @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. 
The \ No newline at end of file diff --git a/docstore/ac140b3a-3d78-4f1a-859b-e13bcc621e9f b/docstore/ac140b3a-3d78-4f1a-859b-e13bcc621e9f new file mode 100644 index 0000000000000000000000000000000000000000..10818600c4a983fe2afdf5abfcdbca758413b32e --- /dev/null +++ b/docstore/ac140b3a-3d78-4f1a-859b-e13bcc621e9f @@ -0,0 +1 @@ +the files the user uploads and pay to store them on a per hour basis. The input / output cost per request with Gemini Flash for example is ~4x less than the standard input / output cost, so if the user chats with their data enough, it becomes a huge cost saving for you as the developer. Long context limitations In various sections of this guide, we talked about how Gemini models achieve high performance across various needle-in-a-haystack retrieval evals. These tests consider the most basic setup, where you have a single needle you are looking for. In cases where you might have multiple "needles" or specific pieces of information you are looking for, the model does not perform with the same accuracy. Performance can vary to a wide degree depending on the context. This is important to consider as there is an inherent tradeoff between getting the right information retrieved and cost. You can get ~99% on a single query, but you have to pay the input token cost every time you send that query. So for 100 pieces of information to be retrieved, if you needed 99% performance, you would likely need to send 100 requests. This is a good example of where context caching can significantly reduce the cost associated with using Gemini models while keeping the performance high. FAQs Where is the best place to put my query in the context window? In most cases, especially if the total context is long, the model's performance will be better if you put your query / question at the end of the prompt (after all the other context). Do I lose model performance when I add more tokens to a query? Generally, if you don't need tokens to be passed to the model, it is best to avoid passing them. However, if you have a large chunk of tokens with some information and want to ask questions about that information, the model is highly capable of extracting that information (up to 99% accuracy in many cases). How can I lower my cost with long-context queries? If you have a similar set of tokens / \ No newline at end of file diff --git a/docstore/ac1e71ff-d1e6-48f6-ac88-46bcdbc91a27 b/docstore/ac1e71ff-d1e6-48f6-ac88-46bcdbc91a27 new file mode 100644 index 0000000000000000000000000000000000000000..d132d107f5d9545892f17ff23883cb4c3477bdc5 --- /dev/null +++ b/docstore/ac1e71ff-d1e6-48f6-ac88-46bcdbc91a27 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#image-input Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ac280217-809c-452a-8c08-3113964e3103 b/docstore/ac280217-809c-452a-8c08-3113964e3103 new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/ac280217-809c-452a-8c08-3113964e3103 @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. 
Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/ac305118-cecc-4da5-9bbb-e0da702c2bdf b/docstore/ac305118-cecc-4da5-9bbb-e0da702c2bdf new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/ac305118-cecc-4da5-9bbb-e0da702c2bdf @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . 
BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/ac458d19-8870-45fa-b049-0ae438f0798f b/docstore/ac458d19-8870-45fa-b049-0ae438f0798f new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/ac458d19-8870-45fa-b049-0ae438f0798f @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." 
Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/ac57123d-3fcf-4804-94db-c1ce03aea502 b/docstore/ac57123d-3fcf-4804-94db-c1ce03aea502 new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/ac57123d-3fcf-4804-94db-c1ce03aea502 @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/ac688ac9-ee67-436b-9dd0-1cffb9a0f202 b/docstore/ac688ac9-ee67-436b-9dd0-1cffb9a0f202 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/ac688ac9-ee67-436b-9dd0-1cffb9a0f202 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. 
Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/ac973e12-6629-443a-86d1-2f362d0a0b53 b/docstore/ac973e12-6629-443a-86d1-2f362d0a0b53 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/ac973e12-6629-443a-86d1-2f362d0a0b53 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. 
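The Go, REST, and Apps Script snippets above all send the same two-turn history; for completeness, a Python sketch of the equivalent streaming chat request might look like the following (model choice and history text copied from the examples above):

Python
from google import genai
from google.genai import types

client = genai.Client()

# Seed the chat with the same history used in the examples above.
chat = client.chats.create(
    model="gemini-2.5-flash",
    history=[
        types.Content(role="user", parts=[types.Part(text="Hello")]),
        types.Content(
            role="model",
            parts=[types.Part(text="Great to meet you. What would you like to know?")],
        ),
    ],
)

# Stream the next turn and print the chunks as they arrive.
for chunk in chat.send_message_stream("I have two dogs in my house. How many paws are in my house?"):
    print(chunk.text, end="")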
To learn more about the models \ No newline at end of file diff --git a/docstore/acac1855-819d-4943-b134-d6f42cb485e9 b/docstore/acac1855-819d-4943-b134-d6f42cb485e9 new file mode 100644 index 0000000000000000000000000000000000000000..44d10ad654e1ae877f525afe1fc1f8db1da83e76 --- /dev/null +++ b/docstore/acac1855-819d-4943-b134-d6f42cb485e9 @@ -0,0 +1 @@ +get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the \ No newline at end of file diff --git a/docstore/acada8d3-7016-437f-aa26-ba5afbffb5ef b/docstore/acada8d3-7016-437f-aa26-ba5afbffb5ef new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/acada8d3-7016-437f-aa26-ba5afbffb5ef @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. 
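Before getting into prompt writing, here is a minimal end-to-end text-to-video sketch in Python. It follows the generate_videos and operation-polling pattern shown elsewhere in these docs; the prompt, config values, and output filename are illustrative.

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

# Subject + context + action, per the prompt elements described below.
prompt = "Panning wide shot of a calico kitten sleeping in the sunshine"

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=prompt,
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",
        person_generation="dont_allow",
    ),
)

# Video generation is a long-running operation; poll until it completes.
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Download and save the generated video(s).
for n, video in enumerate(operation.response.generated_videos):
    client.files.download(file=video.video)
    video.video.save(f"veo_sample_{n}.mp4")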
Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/acd38a3a-4758-46fa-a68f-62fcac1a7be0 b/docstore/acd38a3a-4758-46fa-a68f-62fcac1a7be0 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/acd38a3a-4758-46fa-a68f-62fcac1a7be0 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/acf93daf-0b0f-41cd-8d8e-14f821cf6721 b/docstore/acf93daf-0b0f-41cd-8d8e-14f821cf6721 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/acf93daf-0b0f-41cd-8d8e-14f821cf6721 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/acfabf26-c120-4d09-a62b-de39787402f2 b/docstore/acfabf26-c120-4d09-a62b-de39787402f2 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/acfabf26-c120-4d09-a62b-de39787402f2 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. 
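A compact Python sketch of that declare, call, execute, and respond round trip is shown below. The get_weather function, its schema, and the prompt are hypothetical; the SDK can also run this loop automatically, as covered elsewhere in these docs.

Python
from google import genai
from google.genai import types

# Hypothetical tool the application exposes to the model.
get_weather_declaration = {
    "name": "get_weather",
    "description": "Gets the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}

def get_weather(city: str) -> dict:
    # In a real application this would call an external weather API.
    return {"city": city, "temperature": 18, "unit": "celsius"}

client = genai.Client()
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[get_weather_declaration])]
)
contents = [types.Content(role="user", parts=[types.Part(text="What's the weather in Paris?")])]

# The model decides whether a function call is needed and returns structured arguments.
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
function_call = response.candidates[0].content.parts[0].function_call

if function_call:
    # Your application executes the function with the model-provided arguments.
    result = get_weather(**function_call.args)

    # Send the function result back so the model can write the final, user-facing answer.
    contents.append(response.candidates[0].content)
    contents.append(
        types.Content(
            role="user",
            parts=[types.Part.from_function_response(
                name=function_call.name, response={"result": result}
            )],
        )
    )
    final = client.models.generate_content(
        model="gemini-2.5-flash", contents=contents, config=config
    )
    print(final.text)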
It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/ad055d0a-836e-4750-a329-30fbd1fc5982 b/docstore/ad055d0a-836e-4750-a329-30fbd1fc5982 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/ad055d0a-836e-4750-a329-30fbd1fc5982 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/ad10cbcd-8835-4d70-929c-cdf7d0de9e92 b/docstore/ad10cbcd-8835-4d70-929c-cdf7d0de9e92 new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/ad10cbcd-8835-4d70-929c-cdf7d0de9e92 @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/ad2e4bff-a4c2-4698-84ba-36e57a894460 b/docstore/ad2e4bff-a4c2-4698-84ba-36e57a894460 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/ad2e4bff-a4c2-4698-84ba-36e57a894460 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. 
Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/ad3668fc-ee4a-4e70-a5b9-9ddf75134eb2 b/docstore/ad3668fc-ee4a-4e70-a5b9-9ddf75134eb2 new file mode 100644 index 0000000000000000000000000000000000000000..805c4d57924af50a1afef3b5a11f57c1232fc3f8 --- /dev/null +++ b/docstore/ad3668fc-ee4a-4e70-a5b9-9ddf75134eb2 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/system-instructions Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ad5e65e6-80e0-417f-8424-10b2d763ddf6 b/docstore/ad5e65e6-80e0-417f-8424-10b2d763ddf6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/ad5e65e6-80e0-417f-8424-10b2d763ddf6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ad749bf4-60d4-4018-8a26-6223b3f7f27d b/docstore/ad749bf4-60d4-4018-8a26-6223b3f7f27d new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/ad749bf4-60d4-4018-8a26-6223b3f7f27d @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . 
debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/adb6d2c7-6302-444b-8a2f-2f4498559d4c b/docstore/adb6d2c7-6302-444b-8a2f-2f4498559d4c new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/adb6d2c7-6302-444b-8a2f-2f4498559d4c @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . 
sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/adc01d86-f2c3-42b7-be6d-fb6b78e960a4 b/docstore/adc01d86-f2c3-42b7-be6d-fb6b78e960a4 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/adc01d86-f2c3-42b7-be6d-fb6b78e960a4 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/add91ebf-d277-415c-b929-18b8e6158135 b/docstore/add91ebf-d277-415c-b929-18b8e6158135 new file mode 100644 index 0000000000000000000000000000000000000000..3d45fbcc0883100cfd6a3c6121a97b18a22961cd --- /dev/null +++ b/docstore/add91ebf-d277-415c-b929-18b8e6158135 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/caching Title: Context caching | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/adededb4-0727-4940-bfb8-4416c3647c02 b/docstore/adededb4-0727-4940-bfb8-4416c3647c02 new file mode 100644 index 0000000000000000000000000000000000000000..aa68cf0548cfb2585cc033502540f8f159dbfde7 --- /dev/null +++ b/docstore/adededb4-0727-4940-bfb8-4416c3647c02 @@ -0,0 +1 @@ +Gemini API has built-in protections against core harms, such as content that endangers child safety. These types of harm are always blocked and cannot be adjusted. Content safety filtering level The Gemini API categorizes the probability level of content being unsafe as HIGH , MEDIUM , LOW , or NEGLIGIBLE . The Gemini API blocks content based on the probability of content being unsafe and not the severity. This is important to consider because some content can have low probability of being unsafe even though the severity of harm could still be high. For example, comparing the sentences: The robot punched me. The robot slashed me up. The first sentence might result in a higher probability of being unsafe, but you might consider the second sentence to be a higher severity in terms of violence. Given this, it is important that you carefully test and consider what the appropriate level of blocking is needed to support your key use cases while minimizing harm to end users. Safety filtering per request You can adjust the safety settings for each request you make to the API. When you make a request, the content is analyzed and assigned a safety rating. The safety rating includes the category and the probability of the harm classification. For example, if the content was blocked due to the harassment category having a high probability, the safety rating returned would have category equal to HARASSMENT and harm probability set to HIGH . By default, safety settings block content (including prompts) with medium or higher probability of being unsafe across any filter. This baseline safety is designed to work for most use cases, so you should only adjust your safety settings if it's consistently required for your application. The following table describes the block settings you can adjust for each category. For example, if you set the block setting to Block few for the Hate speech category, everything that has a high probability of being hate speech content is blocked. 
But \ No newline at end of file diff --git a/docstore/ae0054ce-18b0-4661-8bca-8c4a29a7ac26 b/docstore/ae0054ce-18b0-4661-8bca-8c4a29a7ac26 new file mode 100644 index 0000000000000000000000000000000000000000..1a983868db701f2f2de527a3764468fc245fbfe1 --- /dev/null +++ b/docstore/ae0054ce-18b0-4661-8bca-8c4a29a7ac26 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/structured-output Title: Structured output | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ae0967a2-1b05-4d09-9e4b-b06341115143 b/docstore/ae0967a2-1b05-4d09-9e4b-b06341115143 new file mode 100644 index 0000000000000000000000000000000000000000..b9aae1f02a8caa7a25135d3bec800921c05dfc11 --- /dev/null +++ b/docstore/ae0967a2-1b05-4d09-9e4b-b06341115143 @@ -0,0 +1 @@ +( response . choices [ 0 ] . message . content ) JavaScript import fs from "fs" ; import OpenAI from "openai" ; const client = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); const audioFile = fs . readFileSync ( "/path/to/your/audio/file.wav" ); const base64Audio = Buffer . from ( audioFile ). toString ( "base64" ); async function main () { const response = await client . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { role : "user" , content : [ { type : "text" , text : "Transcribe this audio" , }, { type : "input_audio" , input_audio : { data : base64Audio , format : "wav" , }, }, ], }, ], }); console . log ( response . choices [ 0 ]. message . content ); } main (); REST Note: If you get an Argument list too long error, the encoding of your audio file might be too long for curl. bash -c ' base64_audio=$(base64 -i "/path/to/your/audio/file.wav"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"Transcribe this audio file.\" }, { \"type\": \"input_audio\", \"input_audio\": { \"data\": \"${base64_audio}\", \"format\": \"wav\" } } ] } ] }" ' Structured output Gemini models can output JSON objects in any structure you define . Python from pydantic import BaseModel from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) class CalendarEvent ( BaseModel ): name : str date : str participants : list [ str ] completion = client . beta . chat . completions . parse ( model = "gemini-2.0-flash" , messages = [ { "role" : "system" , "content" : "Extract the event information." }, { "role" : "user" , "content" : "John and Susan are going to an AI conference on \ No newline at end of file diff --git a/docstore/ae2320ed-d9ac-4dc7-a91d-20c2431d0d9a b/docstore/ae2320ed-d9ac-4dc7-a91d-20c2431d0d9a new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/ae2320ed-d9ac-4dc7-a91d-20c2431d0d9a @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. 
Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/ae279f99-586c-4c67-a575-5a353a5256f8 b/docstore/ae279f99-586c-4c67-a575-5a353a5256f8 new file mode 100644 index 0000000000000000000000000000000000000000..5df22ab370b3c2108c2a4e677731cc4af2835ff9 --- /dev/null +++ b/docstore/ae279f99-586c-4c67-a575-5a353a5256f8 @@ -0,0 +1 @@ +variables, if you don't pass one to the client. export GEMINI_API_KEY = "YOUR_API_KEY" from google import genai client = genai . Client () # Set the API key using the GEMINI_API_KEY env var. # Alternatively, you could set the API key explicitly: # client = genai.Client(api_key="your_api_key") JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); Go Import the GenAI library: import "google.golang.org/genai" Create the client: client , err := genai . NewClient ( ctx , & genai . ClientConfig { Backend : genai . BackendGeminiAPI , }) Generate content Text Before Python Previously, there were no client objects; you accessed APIs directly through GenerativeModel objects. import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'Tell me a story in 300 words' ) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Tell me a story in 300 words" ; const result = await model . generateContent ( prompt ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ctx , genai .
Text ( "Tell me a story in 300 words." )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response parts After Python The new Google GenAI SDK provides access to all the API methods through the Client object. Except for a few stateful special cases ( chat and live-api session s), these are all stateless functions. For utility and uniformity, objects returned are pydantic classes. \ No newline at end of file diff --git a/docstore/ae456d72-e7bf-4db3-ae97-b2d3e5df6ed8 b/docstore/ae456d72-e7bf-4db3-ae97-b2d3e5df6ed8 new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/ae456d72-e7bf-4db3-ae97-b2d3e5df6ed8 @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . 
generate_content ( model \ No newline at end of file diff --git a/docstore/ae4f0a6f-29ae-4923-b16d-f05499639218 b/docstore/ae4f0a6f-29ae-4923-b16d-f05499639218 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/ae4f0a6f-29ae-4923-b16d-f05499639218 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/ae967280-312a-45c3-ab34-932097c1fe77 b/docstore/ae967280-312a-45c3-ab34-932097c1fe77 new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/ae967280-312a-45c3-ab34-932097c1fe77 @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. 
while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seconds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" imagen = client . models . generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/aeb1b20e-43d0-41b2-9604-6cfcf86b8c83 b/docstore/aeb1b20e-43d0-41b2-9604-6cfcf86b8c83 new file mode 100644 index 0000000000000000000000000000000000000000..185cd7c14b73dd4804292716b4231cde98556b13 --- /dev/null +++ b/docstore/aeb1b20e-43d0-41b2-9604-6cfcf86b8c83 @@ -0,0 +1 @@ +Safety settings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Safety settings The Gemini API provides safety settings that you can adjust during the prototyping stage to determine if your application requires more or less restrictive safety configuration. You can adjust these settings across five filter categories to restrict or allow certain types of content. This guide covers how the Gemini API handles safety settings and filtering and how you can change the safety settings for your application. Note: Applications that use less restrictive safety settings may be subject to review. See the Terms of Service for more information. Safety filters The Gemini API's adjustable safety filters cover the following categories: Category Description Harassment Negative or harmful comments targeting identity and/or protected attributes. Hate speech Content that is rude, disrespectful, or profane. Sexually explicit Contains references to sexual acts or other lewd content. Dangerous Promotes, facilitates, or encourages harmful acts. Civic integrity Election-related queries. These categories are defined in HarmCategory . The Gemini models only support HARM_CATEGORY_HARASSMENT , HARM_CATEGORY_HATE_SPEECH , HARM_CATEGORY_SEXUALLY_EXPLICIT , HARM_CATEGORY_DANGEROUS_CONTENT , and HARM_CATEGORY_CIVIC_INTEGRITY . All other categories are used only by PaLM 2 (Legacy) models. You can use these filters to adjust what's appropriate for your use case.
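For reference, the following is a minimal Python sketch of adjusting one of these filters with the google-genai SDK; the model name, prompt, and BLOCK_ONLY_HIGH threshold are illustrative assumptions, not recommendations from this guide. Python

from google import genai
from google.genai import types

client = genai.Client()

# Minimal sketch: relax the Dangerous Content filter for a game-dialogue use case.
# The chosen threshold (BLOCK_ONLY_HIGH) is an assumed example value.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a short villain monologue for a fantasy video game.",
    config=types.GenerateContentConfig(
        safety_settings=[
            types.SafetySetting(
                category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                threshold=types.HarmBlockThreshold.BLOCK_ONLY_HIGH,
            ),
        ]
    ),
)
print(response.text)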
For example, if you're building video game dialogue, you may deem it acceptable to allow more content that's rated as Dangerous due to the nature of the game. In addition to the adjustable safety filters, the \ No newline at end of file diff --git a/docstore/aec211c9-8fbd-4c8e-8e5a-0ee42c0550f5 b/docstore/aec211c9-8fbd-4c8e-8e5a-0ee42c0550f5 new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/aec211c9-8fbd-4c8e-8e5a-0ee42c0550f5 @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. 
Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/aec2310b-5578-4e85-beaa-1de20bc32da8 b/docstore/aec2310b-5578-4e85-beaa-1de20bc32da8 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/aec2310b-5578-4e85-beaa-1de20bc32da8 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/aeda00e8-7b36-43a6-b520-c131534d4ba7 b/docstore/aeda00e8-7b36-43a6-b520-c131534d4ba7 new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/aeda00e8-7b36-43a6-b520-c131534d4ba7 @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. 
Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/aedb7039-8ac0-4adc-8bd8-edfc0bb004e7 b/docstore/aedb7039-8ac0-4adc-8bd8-edfc0bb004e7 new file mode 100644 index 0000000000000000000000000000000000000000..6b5570dc552776eef13cf8339199673fd1c28eb5 --- /dev/null +++ b/docstore/aedb7039-8ac0-4adc-8bd8-edfc0bb004e7 @@ -0,0 +1 @@ +Generate an image Note: Image generation is only available in the paid tier. Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . chat . completions . 
create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print \ No newline at end of file diff --git a/docstore/aef849be-d013-4cf1-ab1f-76fc25f7d1c5 b/docstore/aef849be-d013-4cf1-ab1f-76fc25f7d1c5 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/aef849be-d013-4cf1-ab1f-76fc25f7d1c5 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/af1da011-d90e-4ce9-9d47-9b258940fc07 b/docstore/af1da011-d90e-4ce9-9d47-9b258940fc07 new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/af1da011-d90e-4ce9-9d47-9b258940fc07 @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." 
, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/af4a7669-a50e-4fe9-b7a4-c0ede4afb348 b/docstore/af4a7669-a50e-4fe9-b7a4-c0ede4afb348 new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/af4a7669-a50e-4fe9-b7a4-c0ede4afb348 @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. 
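As a quick, hedged illustration of the structured output point above, here is a minimal Python sketch using the google-genai SDK; the model name, prompt, and list[str] schema are assumptions made for the example. Python

from google import genai
from google.genai import types

client = genai.Client()

# Minimal sketch: constrain the response to JSON so it can be parsed by an application.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List three practical applications of quantum computing.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[str],  # assumed schema for illustration
    ),
)
print(response.text)  # a JSON array of strings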
Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/af74702c-00fd-4855-95ca-970bb62f0f31 b/docstore/af74702c-00fd-4855-95ca-970bb62f0f31 new file mode 100644 index 0000000000000000000000000000000000000000..44d10ad654e1ae877f525afe1fc1f8db1da83e76 --- /dev/null +++ b/docstore/af74702c-00fd-4855-95ca-970bb62f0f31 @@ -0,0 +1 @@ +get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the \ No newline at end of file diff --git a/docstore/af75d5c2-4d90-447e-942f-724051b7ce16 b/docstore/af75d5c2-4d90-447e-942f-724051b7ce16 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/af75d5c2-4d90-447e-942f-724051b7ce16 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/af7e5733-9f28-4cab-bfa1-fbefddb2979b b/docstore/af7e5733-9f28-4cab-bfa1-fbefddb2979b new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/af7e5733-9f28-4cab-bfa1-fbefddb2979b @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/af83b89b-e909-457b-a226-b4a620d91f60 b/docstore/af83b89b-e909-457b-a226-b4a620d91f60 new file mode 100644 index 0000000000000000000000000000000000000000..c3c7173dc6db245a4d264af187854bff02c98546 --- /dev/null +++ b/docstore/af83b89b-e909-457b-a226-b4a620d91f60 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#available-languages Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/af89375f-8b68-4708-876d-8c941222d2b8 b/docstore/af89375f-8b68-4708-876d-8c941222d2b8 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/af89375f-8b68-4708-876d-8c941222d2b8 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . 
generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/afa903a8-8318-4a89-b70d-d499ed567449 b/docstore/afa903a8-8318-4a89-b70d-d499ed567449 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/afa903a8-8318-4a89-b70d-d499ed567449 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b0133d2a-853d-4fb9-9302-50718cb56bc6 b/docstore/b0133d2a-853d-4fb9-9302-50718cb56bc6 new file mode 100644 index 0000000000000000000000000000000000000000..2dce4b1915975420243f156ab22de6a07e8b5cc9 --- /dev/null +++ b/docstore/b0133d2a-853d-4fb9-9302-50718cb56bc6 @@ -0,0 +1 @@ +The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . 
Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b01a74e2-665f-49d1-9c20-508e836173fe b/docstore/b01a74e2-665f-49d1-9c20-508e836173fe new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/b01a74e2-665f-49d1-9c20-508e836173fe @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/b02172bf-59e5-4d44-a92b-f18abbf6bace b/docstore/b02172bf-59e5-4d44-a92b-f18abbf6bace new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/b02172bf-59e5-4d44-a92b-f18abbf6bace @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/b040d6e7-d6a0-4eca-b1b1-5c70ae27c92a b/docstore/b040d6e7-d6a0-4eca-b1b1-5c70ae27c92a new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/b040d6e7-d6a0-4eca-b1b1-5c70ae27c92a @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. 
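To make the prompt-chaining idea above concrete, here is a minimal Python sketch with the google-genai SDK in which the output of the first prompt becomes the input of the second; the two-step summarize-then-translate task and the model name are illustrative assumptions. Python

from google import genai

client = genai.Client()
model = "gemini-2.5-flash"  # assumed model for illustration

# Step 1: the first prompt in the chain produces an intermediate result.
summary = client.models.generate_content(
    model=model,
    contents="Summarize the benefits of breaking a complex task into chained prompts, in two sentences.",
).text

# Step 2: the output of the first prompt becomes the input of the next prompt.
translation = client.models.generate_content(
    model=model,
    contents=f"Translate the following text into French:\n{summary}",
).text

print(translation)  # the output of the last prompt in the sequence is the final output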
Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/b042b6eb-76f8-4625-98f3-38e13d6699eb b/docstore/b042b6eb-76f8-4625-98f3-38e13d6699eb new file mode 100644 index 0000000000000000000000000000000000000000..709a38c3bd077fba18fce9cb2b918f625c0b4e59 --- /dev/null +++ b/docstore/b042b6eb-76f8-4625-98f3-38e13d6699eb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b04c6f8b-8e3f-4c97-bb1d-9b010a9ba44c b/docstore/b04c6f8b-8e3f-4c97-bb1d-9b010a9ba44c new file mode 100644 index 0000000000000000000000000000000000000000..2296aedd2799dc2d35c1f54c351a8c223662c6b9 --- /dev/null +++ b/docstore/b04c6f8b-8e3f-4c97-bb1d-9b010a9ba44c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash-8b Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b05f0a4f-aced-417f-b5ea-e2f6117548f0 b/docstore/b05f0a4f-aced-417f-b5ea-e2f6117548f0 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/b05f0a4f-aced-417f-b5ea-e2f6117548f0 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/b0691010-8e33-4b37-90f8-e6d0f6c257e1 b/docstore/b0691010-8e33-4b37-90f8-e6d0f6c257e1 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/b0691010-8e33-4b37-90f8-e6d0f6c257e1 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/b069bf4d-4acb-4dd7-9caa-82c6b33bb05b b/docstore/b069bf4d-4acb-4dd7-9caa-82c6b33bb05b new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b069bf4d-4acb-4dd7-9caa-82c6b33bb05b @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b06bad68-bc13-4164-a1d6-f14e91237fae b/docstore/b06bad68-bc13-4164-a1d6-f14e91237fae new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/b06bad68-bc13-4164-a1d6-f14e91237fae @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . 
For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/b07e63d9-a3cc-4a7a-b884-78dd9a304322 b/docstore/b07e63d9-a3cc-4a7a-b884-78dd9a304322 new file mode 100644 index 0000000000000000000000000000000000000000..eb6db224edbdd160f04cb946308fd82587e98eec --- /dev/null +++ b/docstore/b07e63d9-a3cc-4a7a-b884-78dd9a304322 @@ -0,0 +1 @@ +marks Spain's record-breaking fourth European Championship title.[5]((https:/...), [2](https:/...), [3](https:/...), [4](https:/...) Pricing When you use Grounding with Google Search, your project is billed per API request that includes the google_search tool. If the model decides to execute multiple search queries to answer a single prompt (for example, searching for "UEFA Euro 2024 winner" and "Spain vs England Euro 2024 final score" within the same API call), this counts as a single billable use of the tool for that request. For detailed pricing information, see the Gemini API pricing page . Supported Models Experimental and Preview models are not included. You can find their capabilities on the model overview page. Model Grounding with Google Search Gemini 2.5 Pro ✔️ Gemini 2.5 Flash ✔️ Gemini 2.0 Flash ✔️ Gemini 1.5 Pro ✔️ Gemini 1.5 Flash ✔️ Note: Older models use a google_search_retrieval tool. For all current models, use the google_search tool as shown in the examples. Grounding with Gemini 1.5 Models (Legacy) While the google_search tool is recommended for Gemini 2.0 and later, Gemini 1.5 support a legacy tool named google_search_retrieval . This tool provides a dynamic mode that allows the model to decide whether to perform a search based on its confidence that the prompt requires fresh information. If the model's confidence is above a dynamic_threshold you set (a value between 0.0 and 1.0), it will perform a search. Python # Note: This is a legacy approach for Gemini 1.5 models. # The 'google_search' tool is recommended for all new development. import os from google import genai from google.genai import types client = genai . Client () retrieval_tool = types . Tool ( google_search_retrieval = types . GoogleSearchRetrieval ( dynamic_retrieval_config = types . DynamicRetrievalConfig ( mode = types . DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamic_threshold = 0.7 # Only search if confidence > 70% ) ) ) config = types . 
GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/b08254c3-eea1-40d9-8be7-18a666461a77 b/docstore/b08254c3-eea1-40d9-8be7-18a666461a77 new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/b08254c3-eea1-40d9-8be7-18a666461a77 @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b084d94c-4ddb-4c23-a589-35053ef42e05 b/docstore/b084d94c-4ddb-4c23-a589-35053ef42e05 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/b084d94c-4ddb-4c23-a589-35053ef42e05 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/b09339f4-138d-452a-86b8-c571c2b14234 b/docstore/b09339f4-138d-452a-86b8-c571c2b14234 new file mode 100644 index 0000000000000000000000000000000000000000..ddc1ec68807ed0017d00c5153db6b826d6e2aced --- /dev/null +++ b/docstore/b09339f4-138d-452a-86b8-c571c2b14234 @@ -0,0 +1 @@ +"GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const model = await openai . models . retrieve ( "gemini-2.0-flash" ); console . log ( model . id ); } main (); REST curl https://generativelanguage.googleapis.com/v1beta/openai/models/gemini-2.0-flash \ -H "Authorization: Bearer GEMINI_API_KEY" Current limitations Support for the OpenAI libraries is still in beta while we extend feature support. If you have questions about supported parameters, upcoming features, or run into any issues getting started with Gemini, join our Developer Forum . What's next Try our OpenAI Compatibility Colab to work through more detailed examples. 
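For completeness, here is a minimal Python sketch of a chat completion through the OpenAI-compatibility endpoint shown above; the prompt is an illustrative assumption, and GEMINI_API_KEY stands in for a real key. Python

from openai import OpenAI

# Point the OpenAI client at the Gemini OpenAI-compatible endpoint.
client = OpenAI(
    api_key="GEMINI_API_KEY",
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)

response = client.chat.completions.create(
    model="gemini-2.0-flash",
    messages=[{"role": "user", "content": "Explain OpenAI compatibility in one sentence."}],
)
print(response.choices[0].message.content)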
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-18 UTC. \ No newline at end of file diff --git a/docstore/b097a69b-11ef-4050-a0d9-1abaa18322cc b/docstore/b097a69b-11ef-4050-a0d9-1abaa18322cc new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/b097a69b-11ef-4050-a0d9-1abaa18322cc @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/b09dc8ef-e247-4798-a74e-d8c7dde04c87 b/docstore/b09dc8ef-e247-4798-a74e-d8c7dde04c87 new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/b09dc8ef-e247-4798-a74e-d8c7dde04c87 @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. 
At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/b0ba0ef8-fb4b-4481-9bdf-1efce6ae135e b/docstore/b0ba0ef8-fb4b-4481-9bdf-1efce6ae135e new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/b0ba0ef8-fb4b-4481-9bdf-1efce6ae135e @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. 
Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/b0cc640f-bf46-4480-a7f5-326b0fb4d70a b/docstore/b0cc640f-bf46-4480-a7f5-326b0fb4d70a new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/b0cc640f-bf46-4480-a7f5-326b0fb4d70a @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/b0d87a3a-0a68-4afd-8ade-e329c5b4895f b/docstore/b0d87a3a-0a68-4afd-8ade-e329c5b4895f new file mode 100644 index 0000000000000000000000000000000000000000..ca9fcd920a86bdee4d0b622e9ecd16eba0587472 --- /dev/null +++ b/docstore/b0d87a3a-0a68-4afd-8ade-e329c5b4895f @@ -0,0 +1 @@ +[{"code_execution": {}}], "contents": [ { "role": "user", "parts": [{ "text": "Can you print \"Hello world!\"?" }] },{ "role": "model", "parts": [ { "text": "" }, { "executable_code": { "language": "PYTHON", "code": "\nprint(\"hello world!\")\n" } }, { "code_execution_result": { "outcome": "OUTCOME_OK", "output": "hello world!\n" } }, { "text": "I have printed \"hello world!\" using the provided python code block. \n" } ], },{ "role": "user", "parts": [{ "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." }] } ] }' Input/output (I/O) Starting with Gemini 2.0 Flash , code execution supports file input and graph output. Using these input and output capabilities, you can upload CSV and text files, ask questions about the files, and have Matplotlib graphs generated as part of the response. The output files are returned as inline images in the response. I/O pricing When using code execution I/O, you're charged for input tokens and output tokens: Input tokens: User prompt Output tokens: Code generated by the model Code execution output in the code environment Thinking tokens Summary generated by the model I/O details When you're working with code execution I/O, be aware of the following technical details: The maximum runtime of the code environment is 30 seconds. If the code environment generates an error, the model may decide to regenerate the code output. This can happen up to 5 times. The maximum file input size is limited by the model token window. In AI Studio, using Gemini Flash 2.0, the maximum input file size is 1 million tokens (roughly 2MB for text files of the supported input types). If you upload a file that's too large, AI Studio won't let you send it. Code execution works best with text and CSV files. The input file can be passed in part.inlineData or part.fileData (uploaded via the Files API ), and the output file is always returned as part.inlineData . Single turn Bidirectional \ No newline at end of file diff --git a/docstore/b0e4891c-02c2-4d5d-a8a0-0a46315f7ab0 b/docstore/b0e4891c-02c2-4d5d-a8a0-0a46315f7ab0 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/b0e4891c-02c2-4d5d-a8a0-0a46315f7ab0 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). 
Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/b0ebc713-0f76-480c-8b83-22ef3a02519c b/docstore/b0ebc713-0f76-480c-8b83-22ef3a02519c new file mode 100644 index 0000000000000000000000000000000000000000..8759a03a20a3177c7734cd1638fb9c60e8d9d57e --- /dev/null +++ b/docstore/b0ebc713-0f76-480c-8b83-22ef3a02519c @@ -0,0 +1 @@ +popularized by short form video apps (for example, YouTube shorts). Use this for tall objects with strong vertical orientations such as buildings, trees, waterfalls, or other similar objects. Prompt: a digital render of a massive skyscraper, modern, grand, epic with a beautiful sunset in the background (9:16 aspect ratio) Photorealistic images Different versions of the image generation model might offer a mix of artistic and photorealistic output. Use the following wording in prompts to generate more photorealistic output, based on the subject you want to generate. Note: Take these keywords as general guidance when you try to create photorealistic images. They aren't required to achieve your goal. 
Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Portraits Use case Lens type Focal lengths Additional details People (portraits) Prime, zoom 24-35mm black and white film, Film noir, Depth of field, duotone (mention two colors) Using several keywords from the table, Imagen can generate the following portraits: Prompt: A woman, 35mm portrait, blue and grey duotones Model: imagen-3.0-generate-002 Prompt: A woman, 35mm portrait, film noir Model: imagen-3.0-generate-002 Objects Use case Lens type Focal lengths Additional details Food, insects, plants (objects, still life) Macro 60-105mm High detail, precise focusing, controlled lighting Using several keywords from the table, Imagen can generate the following object images: Prompt: leaf of a prayer plant, macro lens, 60mm Model: imagen-3.0-generate-002 Prompt: a plate of pasta, \ No newline at end of file diff --git a/docstore/b0f179c4-ac1d-46df-a15a-a90469ffd9e6 b/docstore/b0f179c4-ac1d-46df-a15a-a90469ffd9e6 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/b0f179c4-ac1d-46df-a15a-a90469ffd9e6 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/b10a24ae-ed82-4380-943a-09190f4fc878 b/docstore/b10a24ae-ed82-4380-943a-09190f4fc878 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/b10a24ae-ed82-4380-943a-09190f4fc878 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/b1120567-ba3b-4767-b17c-d9e6e39b9da9 b/docstore/b1120567-ba3b-4767-b17c-d9e6e39b9da9 new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/b1120567-ba3b-4767-b17c-d9e6e39b9da9 @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . 
Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/b1234894-34ad-4110-a03e-31ccaaffeaa4 b/docstore/b1234894-34ad-4110-a03e-31ccaaffeaa4 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/b1234894-34ad-4110-a03e-31ccaaffeaa4 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/b155f830-a25f-4cb8-9281-321f5131b803 b/docstore/b155f830-a25f-4cb8-9281-321f5131b803 new file mode 100644 index 0000000000000000000000000000000000000000..3d0efcbd852506bcdcffe96143ffad9326aef9eb --- /dev/null +++ b/docstore/b155f830-a25f-4cb8-9281-321f5131b803 @@ -0,0 +1 @@ +from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) print ( response . text ) print ( response . model_dump_json ( exclude_none = True , indent = 4 )) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story in 300 words." , }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me a story in 300 words." ), nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Image Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Tell me a story based on this image' , Image . open ( image_path ) ]) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); function fileToGenerativePart ( path , mimeType ) { return { inlineData : { data : Buffer . from ( fs . readFileSync ( path )). toString ( "base64" ), mimeType , }, }; } const prompt = "Tell me a story based on this image" ; const imagePart = fileToGenerativePart ( `path/to/organ.jpg` , "image/jpeg" , ); const result = await model . generateContent ([ prompt , imagePart ]); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( \ No newline at end of file diff --git a/docstore/b15ef614-7f8f-4685-a7d7-4169087b353c b/docstore/b15ef614-7f8f-4685-a7d7-4169087b353c new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/b15ef614-7f8f-4685-a7d7-4169087b353c @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/b15f8ade-2673-4ef4-a8e2-02f440bb6866 b/docstore/b15f8ade-2673-4ef4-a8e2-02f440bb6866 new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/b15f8ade-2673-4ef4-a8e2-02f440bb6866 @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . 
text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/b181e328-6334-4d2c-b645-a761881743ab b/docstore/b181e328-6334-4d2c-b645-a761881743ab new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/b181e328-6334-4d2c-b645-a761881743ab @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, may come with more restrictive rate limits, and may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits.
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/b18529b8-44e7-43a7-b8a7-98e95dc62924 b/docstore/b18529b8-44e7-43a7-b8a7-98e95dc62924 new file mode 100644 index 0000000000000000000000000000000000000000..b23be391a6df64a9ced86d5078afab15427d3d3f --- /dev/null +++ b/docstore/b18529b8-44e7-43a7-b8a7-98e95dc62924 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b1cd2d25-6290-4a91-8bc9-cab248a28e3c b/docstore/b1cd2d25-6290-4a91-8bc9-cab248a28e3c new file mode 100644 index 0000000000000000000000000000000000000000..1d2463b6c11af951d3bab4a46bb6e7601785f7d8 --- /dev/null +++ b/docstore/b1cd2d25-6290-4a91-8bc9-cab248a28e3c @@ -0,0 +1 @@ +'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Meet the models Use Gemini in Google AI Studio 2.5 Pro spark Our most powerful thinking model with features for complex reasoning and much more 2.5 Flash spark Our newest multimodal model, with next generation features and improved capabilities 2.5 Flash-Lite spark Our fastest and most cost-efficient multimodal model with great performance for high-frequency tasks Explore the API Native Image Generation Generate and edit highly contextual images natively with Gemini 2.0 Flash. Explore long context Input millions of tokens to Gemini models and derive understanding from unstructured images, videos, and documents. Generate structured outputs Constrain Gemini to respond with JSON, a structured data format suitable for automated processing. Start building with the Gemini API Get started Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b1e704c8-55f8-485c-95dd-c2afc21f5f3f b/docstore/b1e704c8-55f8-485c-95dd-c2afc21f5f3f new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b1e704c8-55f8-485c-95dd-c2afc21f5f3f @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b2201eee-00cb-40dd-9e16-b519fc3092aa b/docstore/b2201eee-00cb-40dd-9e16-b519fc3092aa new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/b2201eee-00cb-40dd-9e16-b519fc3092aa @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties ().
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/b26b12b0-b7a8-4811-8112-b717b0f7d5b7 b/docstore/b26b12b0-b7a8-4811-8112-b717b0f7d5b7 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b26b12b0-b7a8-4811-8112-b717b0f7d5b7 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b2796215-f99c-4ac7-9a27-e9623bfe436b b/docstore/b2796215-f99c-4ac7-9a27-e9623bfe436b new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/b2796215-f99c-4ac7-9a27-e9623bfe436b @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. 
Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/b29a3453-8435-426a-b198-569d058fd555 b/docstore/b29a3453-8435-426a-b198-569d058fd555 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/b29a3453-8435-426a-b198-569d058fd555 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/b2c5288e-46e7-4cb8-a60e-21ecdd5cf66e b/docstore/b2c5288e-46e7-4cb8-a60e-21ecdd5cf66e new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/b2c5288e-46e7-4cb8-a60e-21ecdd5cf66e @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/b2d05a0e-3631-4a07-bb60-f0177fcad35b b/docstore/b2d05a0e-3631-4a07-bb60-f0177fcad35b new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/b2d05a0e-3631-4a07-bb60-f0177fcad35b @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b2e678fb-e08f-4a37-817c-f4d2493d0a3c b/docstore/b2e678fb-e08f-4a37-817c-f4d2493d0a3c new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/b2e678fb-e08f-4a37-817c-f4d2493d0a3c @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b2efdf0c-b2aa-456c-95df-4d6f643c5d17 b/docstore/b2efdf0c-b2aa-456c-95df-4d6f643c5d17 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b2efdf0c-b2aa-456c-95df-4d6f643c5d17 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b30ba261-1dbf-4e68-b25e-677361ecd50a b/docstore/b30ba261-1dbf-4e68-b25e-677361ecd50a new file mode 100644 index 0000000000000000000000000000000000000000..34a11694c4ec18846b4474603e1c63c04851d790 --- /dev/null +++ b/docstore/b30ba261-1dbf-4e68-b25e-677361ecd50a @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . 
message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'When did the last Brazil vs. Argentina soccer match happen?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Combining multiple tools You can combine multiple tools within the Live API, increasing your application's capabilities even more: Python prompt = """ Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! """ tools = [ { "google_search" : {}}, { "code_execution" : {}}, { "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}, ] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } # ... remaining model call JavaScript const prompt = `Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! ` const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ \ No newline at end of file diff --git a/docstore/b3260ddd-2d2a-4f94-ba35-f6608c8af692 b/docstore/b3260ddd-2d2a-4f94-ba35-f6608c8af692 new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/b3260ddd-2d2a-4f94-ba35-f6608c8af692 @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" 
} ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/b346d524-8c8a-4b51-8e17-2b1d4faf9f46 b/docstore/b346d524-8c8a-4b51-8e17-2b1d4faf9f46 new file mode 100644 index 0000000000000000000000000000000000000000..4403c8e8ebca16251f4875b8e14907f4412efbd1 --- /dev/null +++ b/docstore/b346d524-8c8a-4b51-8e17-2b1d4faf9f46 @@ -0,0 +1 @@ +"role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import \ No newline at end of file diff --git a/docstore/b35e86aa-e390-48af-bc79-866aa4d632ed b/docstore/b35e86aa-e390-48af-bc79-866aa4d632ed new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/b35e86aa-e390-48af-bc79-866aa4d632ed @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. 
Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . \ No newline at end of file diff --git a/docstore/b35fb6a9-5c77-4cc0-9df3-dcceb5e08515 b/docstore/b35fb6a9-5c77-4cc0-9df3-dcceb5e08515 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/b35fb6a9-5c77-4cc0-9df3-dcceb5e08515 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... 
Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/b378b514-da8b-4104-9601-4ea8349b1e5c b/docstore/b378b514-da8b-4104-9601-4ea8349b1e5c new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/b378b514-da8b-4104-9601-4ea8349b1e5c @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/b37e50d0-0d1f-4761-aa05-bc41d9f4d502 b/docstore/b37e50d0-0d1f-4761-aa05-bc41d9f4d502 new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/b37e50d0-0d1f-4761-aa05-bc41d9f4d502 @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. 
Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/b38dce85-513c-47fa-a5a4-4c72dd094238 b/docstore/b38dce85-513c-47fa-a5a4-4c72dd094238 new file mode 100644 index 0000000000000000000000000000000000000000..d8377d018b3db5f7ee9855aef65f187ecbbeaa66 --- /dev/null +++ b/docstore/b38dce85-513c-47fa-a5a4-4c72dd094238 @@ -0,0 +1 @@ +print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) fmt . Println ( result . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ \ No newline at end of file diff --git a/docstore/b38de8e3-2082-4aa0-8271-d2942840c85c b/docstore/b38de8e3-2082-4aa0-8271-d2942840c85c new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/b38de8e3-2082-4aa0-8271-d2942840c85c @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a Live API-only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here.
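Because the run() helper is omitted above, here is a minimal, hedged sketch of how the prompt and tools defined in that snippet could be wired into a Live API session, reusing the connect/send/receive pattern shown elsewhere on this page. The model name is copied from the other Live API examples here, the prompt and tools variables come from the snippet above, and handling of the lights tool calls is still omitted.

# Hedged sketch only: wires the prompt/tools above into a Live API session.
import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"  # model name taken from the Live API examples on this page
config = {"response_modalities": ["TEXT"], "tools": tools}  # tools defined in the snippet above

async def run():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send the multi-task prompt as a single user turn.
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": prompt}]}, turn_complete=True
        )
        # Print streamed text; responding to tool-call messages for the lights
        # function declarations is omitted for brevity.
        async for response in session.receive():
            if response.text is not None:
                print(response.text, end="")

asyncio.run(run())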
\ No newline at end of file diff --git a/docstore/b39170ed-e299-4ed4-a61c-c39e4fb85746 b/docstore/b39170ed-e299-4ed4-a61c-c39e4fb85746 new file mode 100644 index 0000000000000000000000000000000000000000..3c7819093774ca7626711ba692d14925f51fa93f --- /dev/null +++ b/docstore/b39170ed-e299-4ed4-a61c-c39e4fb85746 @@ -0,0 +1 @@ +text embeddings: Python from google import genai client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , }); console . log ( response . embeddings ); } main (); Go package main import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := [] * genai . Content { genai . NewContentFromText ( "What is the meaning of life?" , genai . RoleUser ), } result , err := client . Models . EmbedContent ( ctx , "gemini-embedding-exp-03-07" , contents , nil , ) if err != nil { log . Fatal ( err ) } embeddings , err := json . MarshalIndent ( result . Embeddings , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( string ( embeddings )) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]} }' You can also generate embeddings for multiple chunks at once by passing them in as a list of strings. Task types When building Retrieval Augmented Generation (RAG) systems, a common design is to use text embeddings to perform a similarity search. In some cases this can lead to degraded quality, because questions and their answers are not semantically similar. For example, a question like "Why is the sky blue?" and its answer "The scattering of sunlight causes the blue color," have distinctly different \ No newline at end of file diff --git a/docstore/b3990598-b00d-4ac8-b660-3f3fd1025ac2 b/docstore/b3990598-b00d-4ac8-b660-3f3fd1025ac2 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/b3990598-b00d-4ac8-b660-3f3fd1025ac2 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/b3a0382b-866a-43d2-9456-5d464fe3524c b/docstore/b3a0382b-866a-43d2-9456-5d464fe3524c new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/b3a0382b-866a-43d2-9456-5d464fe3524c @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases.
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/b3ab55d1-83d9-4bb0-9eff-7d6510adc353 b/docstore/b3ab55d1-83d9-4bb0-9eff-7d6510adc353 new file mode 100644 index 0000000000000000000000000000000000000000..6a0f5762f2e47222d475421a2613ce0f732fa260 --- /dev/null +++ b/docstore/b3ab55d1-83d9-4bb0-9eff-7d6510adc353 @@ -0,0 +1 @@ +in the Gemini API by setting clipping intervals or providing custom frame rate sampling. Tip: Video clipping and frames per second (FPS) are supported by all models, but the quality is significantly higher from 2.5 series models. Set clipping intervals You can clip video by specifying videoMetadata with start and end offsets. Python response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=XEzRZ35urlk' ), video_metadata = types . VideoMetadata ( start_offset = '1250s' , end_offset = '1570s' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) Set a custom frame rate You can set custom frame rate sampling by passing an fps argument to videoMetadata . Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.5-flash-preview-05-20' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ), video_metadata = types . VideoMetadata ( fps = 5 ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) By default 1 frame per second (FPS) is sampled from the video. You might want to set low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. Supported video formats Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp Technical details about videos Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at \ No newline at end of file diff --git a/docstore/b3b502fe-14bb-4d67-95b6-ab5295321237 b/docstore/b3b502fe-14bb-4d67-95b6-ab5295321237 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/b3b502fe-14bb-4d67-95b6-ab5295321237 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . 
ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/b3c3ea6b-4544-42d9-aed7-b4be4bcde1ad b/docstore/b3c3ea6b-4544-42d9-aed7-b4be4bcde1ad new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/b3c3ea6b-4544-42d9-aed7-b4be4bcde1ad @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. 
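To see how a request actually tracked against the requested budget, you can inspect the token counts reported on the response. The following is a small, hedged sketch: the usage_metadata field names used below (thoughts_token_count, total_token_count) are assumptions about the google-genai Python SDK surface rather than something shown in the surrounding examples.

# Hedged sketch: compare the requested thinking budget with reported token usage.
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=1024)
    ),
)
print(response.text)
# Assumed field names; useful for spotting budget overflow or underflow.
usage = response.usage_metadata
print("thinking tokens:", usage.thoughts_token_count)
print("total tokens:", usage.total_token_count)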
The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/b3ca87f4-43e8-40d5-a003-a28e5e4c64fb b/docstore/b3ca87f4-43e8-40d5-a003-a28e5e4c64fb new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/b3ca87f4-43e8-40d5-a003-a28e5e4c64fb @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/b3cb629c-ff4a-4f76-8c01-a22e0ceacb8d b/docstore/b3cb629c-ff4a-4f76-8c01-a22e0ceacb8d new file mode 100644 index 0000000000000000000000000000000000000000..82539837fcb7adc353717f66580809eb160e30f0 --- /dev/null +++ b/docstore/b3cb629c-ff4a-4f76-8c01-a22e0ceacb8d @@ -0,0 +1 @@ +Google stock price?" , tools = 'google_search_retrieval' ) After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the Google stock price?' , config = types . 
GenerateContentConfig ( tools = [ types . Tool ( google_search = types . GoogleSearch () ) ] ) ) JSON response Generate answers in JSON format. Before Python By specifying a response_schema and setting response_mime_type="application/json" users can constrain the model to produce a JSON response following a given structure. import google.generativeai as genai import typing_extensions as typing class CountryInfo ( typing . TypedDict ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" ) result = model . generate_content ( "Give me information of the United States" , generation_config = genai . GenerationConfig ( response_mime_type = "application/json" , response_schema = CountryInfo ), ) JavaScript import { GoogleGenerativeAI , SchemaType } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const schema = { description : "List of recipes" , type : SchemaType . ARRAY , items : { type : SchemaType . OBJECT , properties : { recipeName : { type : SchemaType . STRING , description : "Name of the recipe" , nullable : false , }, }, required : [ "recipeName" ], }, }; const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" , generationConfig : { responseMimeType : "application/json" , responseSchema : schema , }, }); const result = await model . generateContent ( "List a few popular cookie recipes." , ); console . log ( result . response . text ()); After Python The new SDK uses pydantic classes to provide the schema (although you can pass a genai.types.Schema , or equivalent \ No newline at end of file diff --git a/docstore/b3da5427-0247-469a-bfc6-e56a86e72b4f b/docstore/b3da5427-0247-469a-bfc6-e56a86e72b4f new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/b3da5427-0247-469a-bfc6-e56a86e72b4f @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/b3f1fcc1-26e0-491d-b1bb-b27a57c3ad1f b/docstore/b3f1fcc1-26e0-491d-b1bb-b27a57c3ad1f new file mode 100644 index 0000000000000000000000000000000000000000..f96be70c77f1fc155c8f8be6888201aed7e6b2ca --- /dev/null +++ b/docstore/b3f1fcc1-26e0-491d-b1bb-b27a57c3ad1f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#preview Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b3fc866a-8eb4-4679-bddb-c83d47365e07 b/docstore/b3fc866a-8eb4-4679-bddb-c83d47365e07 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/b3fc866a-8eb4-4679-bddb-c83d47365e07 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b404521d-ba1b-4cf2-803f-458425324f29 b/docstore/b404521d-ba1b-4cf2-803f-458425324f29 new file mode 100644 index 0000000000000000000000000000000000000000..e7d90d05bdba8c7db744a9be77541e0c0d9a3b49 --- /dev/null +++ b/docstore/b404521d-ba1b-4cf2-803f-458425324f29 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#main-content Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b4122978-2fce-4659-a367-c14b49791459 b/docstore/b4122978-2fce-4659-a367-c14b49791459 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/b4122978-2fce-4659-a367-c14b49791459 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/b4192731-7f74-44a2-8854-76c1ea7d1adf b/docstore/b4192731-7f74-44a2-8854-76c1ea7d1adf new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/b4192731-7f74-44a2-8854-76c1ea7d1adf @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . 
readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/b4216dd1-4f43-40dd-8802-24c980c4ec49 b/docstore/b4216dd1-4f43-40dd-8802-24c980c4ec49 new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/b4216dd1-4f43-40dd-8802-24c980c4ec49 @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . \ No newline at end of file diff --git a/docstore/b44ebe8f-4f67-475a-8415-7f6e3762c7da b/docstore/b44ebe8f-4f67-475a-8415-7f6e3762c7da new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/b44ebe8f-4f67-475a-8415-7f6e3762c7da @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/b47cc559-e1da-4edb-9d72-3c0d4f06457c b/docstore/b47cc559-e1da-4edb-9d72-3c0d4f06457c new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/b47cc559-e1da-4edb-9d72-3c0d4f06457c @@ -0,0 +1 @@ +and their capabilities, visit the Models page. 
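As one possible stand-in for the streaming/saving code elided in the multi-speaker TTS Python example above, the following hedged sketch wraps the returned audio in a WAV container. It reuses the response object from that example and assumes the audio arrives as raw 16-bit PCM at 24 kHz under response.candidates[0].content.parts[0].inline_data.data; treat both the field path and the sample rate as assumptions.

# Hedged sketch: save the TTS output (assumed raw PCM) as a playable WAV file.
import wave

def save_wave(filename, pcm_bytes, channels=1, rate=24000, sample_width=2):
    # Wrap raw PCM bytes in a WAV container so ordinary players can open it.
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm_bytes)

# Assumed location of the inline audio bytes on the response from the example above.
pcm = response.candidates[0].content.parts[0].inline_data.data
save_wave("podcast_clip.wav", pcm)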
Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b499bde5-d8aa-4c3a-98f8-c351837d02d5 b/docstore/b499bde5-d8aa-4c3a-98f8-c351837d02d5 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/b499bde5-d8aa-4c3a-98f8-c351837d02d5 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/b4acca95-46d8-426f-9d0f-26c2116d8a94 b/docstore/b4acca95-46d8-426f-9d0f-26c2116d8a94 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/b4acca95-46d8-426f-9d0f-26c2116d8a94 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/b4af5818-6eb5-4f93-bc4b-8dfb8f73e6cf b/docstore/b4af5818-6eb5-4f93-bc4b-8dfb8f73e6cf new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/b4af5818-6eb5-4f93-bc4b-8dfb8f73e6cf @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. 
In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/b4ef4b8c-e071-4a0f-9836-2bb78e2f88e1 b/docstore/b4ef4b8c-e071-4a0f-9836-2bb78e2f88e1 new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/b4ef4b8c-e071-4a0f-9836-2bb78e2f88e1 @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. 
For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a \ No newline at end of file diff --git a/docstore/b51099fa-8e3f-495f-b44c-b28d721eb900 b/docstore/b51099fa-8e3f-495f-b44c-b28d721eb900 new file mode 100644 index 0000000000000000000000000000000000000000..989a38b805ed3662f352ccf72b45824dd12e3417 --- /dev/null +++ b/docstore/b51099fa-8e3f-495f-b44c-b28d721eb900 @@ -0,0 +1 @@ +Tool use with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Tool use with Live API Tool use allows Live API to go beyond just conversation by enabling it to perform actions in the real-world and pull in external context while maintaining a real time connection. You can define tools such as Function calling , Code execution , and Google Search with the Live API. Overview of supported tools Here's a brief overview of the available tools for each model: Tool Cascaded models gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Search Yes Yes Yes Function calling Yes Yes No Code execution Yes No No Url context Yes No No Function calling Live API supports function calling, just like regular content generation requests. Function calling lets the Live API interact with external data and programs, greatly increasing what your applications can accomplish. You can define function declarations as part of the session configuration. After receiving tool calls, the client should respond with a list of FunctionResponse objects using the session.send_tool_response method. See the Function calling tutorial to learn more. Note: Unlike the generateContent API, the Live API doesn't support automatic tool response handling. You must handle tool responses manually in your client code. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" # Simple function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" } turn_off_the_lights = { "name" : \ No newline at end of file diff --git a/docstore/b510f4ef-d7c7-44b8-94f8-2a98e0627a85 b/docstore/b510f4ef-d7c7-44b8-94f8-2a98e0627a85 new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/b510f4ef-d7c7-44b8-94f8-2a98e0627a85 @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. 
Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/b53f410d-2e78-4794-bd4c-1b70bd85743e b/docstore/b53f410d-2e78-4794-bd4c-1b70bd85743e new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/b53f410d-2e78-4794-bd4c-1b70bd85743e @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . 
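As a sketch of where that field sits in the session setup (Python; the sensitivity enums and the padding/silence fields are taken from the Live API reference and should be treated as assumptions here, not as definitions from this page):

from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

config = {
    "response_modalities": ["TEXT"],
    "realtime_input_config": {
        "automatic_activity_detection": {
            "disabled": False,  # default: automatic VAD stays on
            "start_of_speech_sensitivity": types.StartSensitivity.START_SENSITIVITY_LOW,
            "end_of_speech_sensitivity": types.EndSensitivity.END_SENSITIVITY_LOW,
            "prefix_padding_ms": 20,
            "silence_duration_ms": 100,
        }
    },
}

# The config is then passed to the live connect call
# (e.g. client.aio.live.connect(model=model, config=config)) as in the session examples above.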
When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/b54582c8-da44-46a5-a69b-bef0ba6affb2 b/docstore/b54582c8-da44-46a5-a69b-bef0ba6affb2 new file mode 100644 index 0000000000000000000000000000000000000000..90311e83465fc930174e1787a7822757d628fc0a --- /dev/null +++ b/docstore/b54582c8-da44-46a5-a69b-bef0ba6affb2 @@ -0,0 +1 @@ +setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the request action of the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function \ No newline at end of file diff --git a/docstore/b547fe7d-57e9-46eb-a34e-9e5886d6c57c b/docstore/b547fe7d-57e9-46eb-a34e-9e5886d6c57c new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/b547fe7d-57e9-46eb-a34e-9e5886d6c57c @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b54b5051-7427-4751-b6d5-f5b02146dcbd b/docstore/b54b5051-7427-4751-b6d5-f5b02146dcbd new file mode 100644 index 0000000000000000000000000000000000000000..ebc8fdc5ad27fd96758924c177eadfccc4d6556f --- /dev/null +++ b/docstore/b54b5051-7427-4751-b6d5-f5b02146dcbd @@ -0,0 +1 @@ +Structured output | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Structured output You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes, standardize them to build a structured database. Gemini can generate either JSON or enum values as structured output. Generating JSON There are two ways to generate JSON using the Gemini API: Configure a schema on the model Provide a schema in a text prompt Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON. Configuring a schema (recommended) To constrain the model to generate JSON, configure a responseSchema . The model will then respond to any prompt with JSON-formatted output. Python from google import genai from pydantic import BaseModel class Recipe ( BaseModel ): recipe_name : str ingredients : list [ str ] client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "List a few popular cookie recipes, and include the amounts of ingredients." , config = { "response_mime_type" : "application/json" , "response_schema" : list [ Recipe ], }, ) # Use the response as a JSON string. print ( response . text ) # Use instantiated objects. my_recipes : list [ Recipe ] = response . parsed Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null. JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = \ No newline at end of file diff --git a/docstore/b5603a92-7824-4728-b5a9-878395ec89cc b/docstore/b5603a92-7824-4728-b5a9-878395ec89cc new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/b5603a92-7824-4728-b5a9-878395ec89cc @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . 
debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/b58ea4e4-fb41-44a8-8dd3-a7f9e188d6c8 b/docstore/b58ea4e4-fb41-44a8-8dd3-a7f9e188d6c8 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/b58ea4e4-fb41-44a8-8dd3-a7f9e188d6c8 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . 
Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/b59484b8-4eac-4cc3-ac1a-da7f6dcfa6e0 b/docstore/b59484b8-4eac-4cc3-ac1a-da7f6dcfa6e0 new file mode 100644 index 0000000000000000000000000000000000000000..1d2463b6c11af951d3bab4a46bb6e7601785f7d8 --- /dev/null +++ b/docstore/b59484b8-4eac-4cc3-ac1a-da7f6dcfa6e0 @@ -0,0 +1 @@ +'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' Meet the models Use Gemini in Google AI Studio 2.5 Pro spark Our most powerful thinking model with features for complex reasoning and much more 2.5 Flash spark Our newest multimodal model, with next generation features and improved capabilities 2.5 Flash-Lite spark Our fastest and most cost-efficient multimodal model with great performance for high-frequency tasks Explore the API Native Image Generation Generate and edit highly contextual images natively with Gemini 2.0 Flash. Explore long context Input millions of tokens to Gemini models and derive understanding from unstructured images, videos, and documents. Generate structured outputs Constrain Gemini to respond with JSON, a structured data format suitable for automated processing. Start building with the Gemini API Get started Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/b5c1e222-ecb2-463f-ab99-0721d0643c9a b/docstore/b5c1e222-ecb2-463f-ab99-0721d0643c9a new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/b5c1e222-ecb2-463f-ab99-0721d0643c9a @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . 
readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/b5f2f925-5ca6-4a7d-9d39-4d36a27aac99 b/docstore/b5f2f925-5ca6-4a7d-9d39-4d36a27aac99 new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/b5f2f925-5ca6-4a7d-9d39-4d36a27aac99 @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/b60f3d07-a1a2-4be4-9571-a549a22079cf b/docstore/b60f3d07-a1a2-4be4-9571-a549a22079cf new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/b60f3d07-a1a2-4be4-9571-a549a22079cf @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. 
Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/b62eddf6-fbe6-4c5f-b74f-45bc02c089c1 b/docstore/b62eddf6-fbe6-4c5f-b74f-45bc02c089c1 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/b62eddf6-fbe6-4c5f-b74f-45bc02c089c1 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . 
Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/b64ffe20-53e1-434d-8122-cc91be76e0da b/docstore/b64ffe20-53e1-434d-8122-cc91be76e0da new file mode 100644 index 0000000000000000000000000000000000000000..1426f6277d87da029e324e49b5a4fcb88dde544c --- /dev/null +++ b/docstore/b64ffe20-53e1-434d-8122-cc91be76e0da @@ -0,0 +1 @@ +live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes the timeLeft , indicating the remaining time and lets you take further action before the connection will be terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/b6723bdd-ba93-4922-b2fc-5ec02c36c231 b/docstore/b6723bdd-ba93-4922-b2fc-5ec02c36c231 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/b6723bdd-ba93-4922-b2fc-5ec02c36c231 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
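A minimal sketch of calling this embedding model with the same google-genai client used throughout these docs (the embed_content method and the embeddings field on its result come from the SDK and are assumptions relative to this page):

from google import genai

client = genai.Client()

# Embed two strings and compare their relatedness with a dot product.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How does a car engine work?"],
)

vec_a, vec_b = (e.values for e in result.embeddings)
similarity = sum(a * b for a, b in zip(vec_a, vec_b))
print(f"Dot-product similarity: {similarity:.3f}")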
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/b6757cec-9471-4d56-8dd5-e4f997dd6c9d b/docstore/b6757cec-9471-4d56-8dd5-e4f997dd6c9d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/b6757cec-9471-4d56-8dd5-e4f997dd6c9d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. 
Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/b677b191-bca7-4ab1-b0df-2128b44cb38a b/docstore/b677b191-bca7-4ab1-b0df-2128b44cb38a new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/b677b191-bca7-4ab1-b0df-2128b44cb38a @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/b68b77c6-2fb4-4035-8381-77fbdd225f1e b/docstore/b68b77c6-2fb4-4035-8381-77fbdd225f1e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/b68b77c6-2fb4-4035-8381-77fbdd225f1e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. 
Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/b68c6d48-b07f-436e-a381-c99f05fac83f b/docstore/b68c6d48-b07f-436e-a381-c99f05fac83f new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/b68c6d48-b07f-436e-a381-c99f05fac83f @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/b6b89245-c46c-4b13-9c03-3527a99ea055 b/docstore/b6b89245-c46c-4b13-9c03-3527a99ea055 new file mode 100644 index 0000000000000000000000000000000000000000..b0d24ed8267a7db2d3f856003571a245204928ff --- /dev/null +++ b/docstore/b6b89245-c46c-4b13-9c03-3527a99ea055 @@ -0,0 +1 @@ +voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); \ No newline at end of file diff --git a/docstore/b6e076c5-667c-4e91-b319-e5e159c9a29f b/docstore/b6e076c5-667c-4e91-b319-e5e159c9a29f new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/b6e076c5-667c-4e91-b319-e5e159c9a29f @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/b6f1449f-1edf-4a66-8f49-56b540c7e1ae b/docstore/b6f1449f-1edf-4a66-8f49-56b540c7e1ae new file mode 100644 index 0000000000000000000000000000000000000000..caeb3ce515fa32fbe54079666313c34a9d6ea8d0 --- /dev/null +++ b/docstore/b6f1449f-1edf-4a66-8f49-56b540c7e1ae @@ -0,0 +1 @@ +picked up automatically by the client when using the Gemini API libraries . Otherwise you will need to pass your API key as an argument when initializing the client. Note that all code samples in the Gemini API docs assume that you have set the environment variable GEMINI_API_KEY . Python from google import genai # The client gets the API key from the environment variable `GEMINI_API_KEY`. client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // The client gets the API key from the environment variable `GEMINI_API_KEY`. const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () // The client gets the API key from the environment variable `GEMINI_API_KEY`. client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { // The client gets the API key from the environment variable `GEMINI_API_KEY`. Client client = new Client (); GenerateContentResponse response = client . models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . 
text ()); } } Apps Script // See \ No newline at end of file diff --git a/docstore/b6f3e6e5-4f7f-42f5-b662-5616e9f875d9 b/docstore/b6f3e6e5-4f7f-42f5-b662-5616e9f875d9 new file mode 100644 index 0000000000000000000000000000000000000000..b99824a0bb181cb1be6367ec11bfeefdd4ec4b3d --- /dev/null +++ b/docstore/b6f3e6e5-4f7f-42f5-b662-5616e9f875d9 @@ -0,0 +1 @@ +AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass audio data inline Instead of uploading an audio file, you can pass inline audio data in the request to generateContent : Python from google.genai import types with open ( 'path/to/small-sample.mp3' , 'rb' ) as f : audio_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ 'Describe this audio clip' , types . Part . from_bytes ( data = audio_bytes , mime_type = 'audio/mp3' , ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64AudioFile = fs . readFileSync ( "path/to/small-sample.mp3" , { encoding : "base64" , }); const contents = [ { text : "Please summarize the audio." }, { inlineData : { mimeType : "audio/mp3" , data : base64AudioFile , }, }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } audioBytes , _ := os . ReadFile ( "/path/to/small-sample.mp3" ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), & genai . \ No newline at end of file diff --git a/docstore/b714e6e6-7d16-415c-a715-375bb174c928 b/docstore/b714e6e6-7d16-415c-a715-375bb174c928 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/b714e6e6-7d16-415c-a715-375bb174c928 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
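A minimal Python sketch of opening a session against this Live model, following the live.connect pattern shown earlier on this page (the client.aio.live path is an assumption based on the async examples above; input/output handling is left as a comment):

import asyncio
from google import genai

client = genai.Client()
model = "gemini-2.0-flash-live-001"
config = {"response_modalities": ["AUDIO"]}

async def main():
    # Open a low-latency bidirectional session with the Live model.
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send audio, video, or text input and consume audio output here,
        # e.g. by iterating over session.receive() as in the examples above.
        pass

asyncio.run(main())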
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/b719512e-7e8a-4300-96a1-5c1a441af31f b/docstore/b719512e-7e8a-4300-96a1-5c1a441af31f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/b719512e-7e8a-4300-96a1-5c1a441af31f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b72fa9aa-b7d8-411b-9712-0a2b22e6d47a b/docstore/b72fa9aa-b7d8-411b-9712-0a2b22e6d47a new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/b72fa9aa-b7d8-411b-9712-0a2b22e6d47a @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/b749f457-6f40-46eb-a172-ca83b66408d2 b/docstore/b749f457-6f40-46eb-a172-ca83b66408d2 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/b749f457-6f40-46eb-a172-ca83b66408d2 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. 
Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/b762c8d1-5d4d-47a5-b04c-0e7cab58cdc5 b/docstore/b762c8d1-5d4d-47a5-b04c-0e7cab58cdc5 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/b762c8d1-5d4d-47a5-b04c-0e7cab58cdc5 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . 
sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/b77e4e9a-7122-4a91-b074-eb3e6d0e4ee4 b/docstore/b77e4e9a-7122-4a91-b074-eb3e6d0e4ee4 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b77e4e9a-7122-4a91-b074-eb3e6d0e4ee4 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b7b8c799-cab9-4c70-adae-82a6e0e25392 b/docstore/b7b8c799-cab9-4c70-adae-82a6e0e25392 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/b7b8c799-cab9-4c70-adae-82a6e0e25392 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/b7bbbae9-726b-4e77-aaf1-3c4416e5515a b/docstore/b7bbbae9-726b-4e77-aaf1-3c4416e5515a new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/b7bbbae9-726b-4e77-aaf1-3c4416e5515a @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. 
If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/b7ca1613-2350-4c52-99a7-187b89ffe93d b/docstore/b7ca1613-2350-4c52-99a7-187b89ffe93d new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/b7ca1613-2350-4c52-99a7-187b89ffe93d @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/b7ea5c29-02af-41ed-89b1-4a4ce073989f b/docstore/b7ea5c29-02af-41ed-89b1-4a4ce073989f new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/b7ea5c29-02af-41ed-89b1-4a4ce073989f @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. 
Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/b80f3426-72a9-48ed-ba6d-14d3010069e1 b/docstore/b80f3426-72a9-48ed-ba6d-14d3010069e1 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/b80f3426-72a9-48ed-ba6d-14d3010069e1 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/b813b478-571f-4746-9628-8291bf645b0c b/docstore/b813b478-571f-4746-9628-8291bf645b0c new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/b813b478-571f-4746-9628-8291bf645b0c @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. 
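As a minimal illustration of the part-ordering tip above (for a single image, put the image part first and the text prompt after it in the contents array), the following Python sketch reuses the sample path and caption prompt used elsewhere on this page; both are placeholders.
Python
from google import genai
from PIL import Image

client = genai.Client()

# Placeholder path; any supported image format works.
image = Image.open("path/to/sample.jpg")

# Single image with text: image part first, text prompt after it.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, "Caption this image."],
)
print(response.text)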
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/b820b9c9-ca15-45ad-ab9b-99740f5bc1f3 b/docstore/b820b9c9-ca15-45ad-ab9b-99740f5bc1f3 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/b820b9c9-ca15-45ad-ab9b-99740f5bc1f3 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/b820f6f6-5dd1-47c0-820d-943ee60cffa5 b/docstore/b820f6f6-5dd1-47c0-820d-943ee60cffa5 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/b820f6f6-5dd1-47c0-820d-943ee60cffa5 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/b829fab7-e089-494d-8437-738320525e10 b/docstore/b829fab7-e089-494d-8437-738320525e10 new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/b829fab7-e089-494d-8437-738320525e10 @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/b8378285-c9ed-40d4-aae0-2848aa4e5038 b/docstore/b8378285-c9ed-40d4-aae0-2848aa4e5038 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/b8378285-c9ed-40d4-aae0-2848aa4e5038 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. 
Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/b87da0c5-71f8-4862-abb9-158155cf696b b/docstore/b87da0c5-71f8-4862-abb9-158155cf696b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/b87da0c5-71f8-4862-abb9-158155cf696b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
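To make the sampling parameters described earlier (temperature, topK, topP, stop sequences) concrete, here is a minimal sketch using types.GenerateContentConfig from the google-genai Python SDK; the specific values are illustrative only, not recommendations.
Python
from google import genai
from google.genai import types

client = genai.Client()

config = types.GenerateContentConfig(
    temperature=0.2,          # lower values give more deterministic output
    top_p=0.95,               # nucleus sampling threshold (0.95 is the default)
    top_k=40,                 # sample only from the 40 most probable tokens
    stop_sequences=["STOP"],  # generation halts if this sequence is produced
    max_output_tokens=200,
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a short product description for a folding bicycle.",
    config=config,
)
print(response.text)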
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/b87f58cd-375d-4873-9ee2-c9eae6a848d3 b/docstore/b87f58cd-375d-4873-9ee2-c9eae6a848d3 new file mode 100644 index 0000000000000000000000000000000000000000..5a239077eb9b54aa8536dab0744a2c5cf91f3c3b --- /dev/null +++ b/docstore/b87f58cd-375d-4873-9ee2-c9eae6a848d3 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-1.5-flash-8b Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b8835d87-6b83-41e5-9ccf-77b9eb0a9b27 b/docstore/b8835d87-6b83-41e5-9ccf-77b9eb0a9b27 new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/b8835d87-6b83-41e5-9ccf-77b9eb0a9b27 @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . 
Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/b8a4aef4-5b26-47f1-87c4-16761635ffd2 b/docstore/b8a4aef4-5b26-47f1-87c4-16761635ffd2 new file mode 100644 index 0000000000000000000000000000000000000000..a76efec9a9a3e7390e77e9a866cc227646391c5b --- /dev/null +++ b/docstore/b8a4aef4-5b26-47f1-87c4-16761635ffd2 @@ -0,0 +1 @@ +Billing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Billing This guide provides an overview of different Gemini API billing options, explains how to enable billing and monitor usage, and provides answers to frequently asked questions (FAQs) about billing. Upgrade to the Gemini API paid tier About billing Billing for the Gemini API is based on two pricing tiers: free of charge (or free ) and pay-as-you-go (or paid ). Pricing and rate limits differ between these tiers and also vary by model. You can check out the rate limits and pricing pages for more into. For a model-by-model breakdown of capabilities, see the Gemini models page . How to request an upgrade To transition from the free tier to the pay-as-you-go plan, you need to enable billing for your Google Cloud project. The button you see in Google AI Studio depends on your project's current plan. If you're on the free tier, you'll see a Set up Billing button for your project. If you're already on the paid tier and meet the criteria for a plan change, you might see an Upgrade button. To start the process, follow these steps: Go to the AI Studio API keys page . Find the project you want to move to the paid plan and click either Set up Billing or Upgrade , depending on the button displayed. The next step depends on the button you clicked: If you clicked Set up Billing: You'll be redirected to the Google Cloud console to link a billing account to your project. Follow the on-screen instructions to complete the process. If you clicked Upgrade: The system will automatically verify your project's eligibility. 
If your project meets all the requirements, it will be instantly upgraded to \ No newline at end of file diff --git a/docstore/b8a8aee2-04b4-4958-8223-0f4ac75c29fd b/docstore/b8a8aee2-04b4-4958-8223-0f4ac75c29fd new file mode 100644 index 0000000000000000000000000000000000000000..96b87b83020710118b1637c95a4f4b11475a3569 --- /dev/null +++ b/docstore/b8a8aee2-04b4-4958-8223-0f4ac75c29fd @@ -0,0 +1 @@ +thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , \ No newline at end of file diff --git a/docstore/b8b2a6f1-dd4d-4da5-9a23-9f7f0ae15f95 b/docstore/b8b2a6f1-dd4d-4da5-9a23-9f7f0ae15f95 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/b8b2a6f1-dd4d-4da5-9a23-9f7f0ae15f95 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/b9208020-13bb-4fce-abc3-6dc27fc2a4d2 b/docstore/b9208020-13bb-4fce-abc3-6dc27fc2a4d2 new file mode 100644 index 0000000000000000000000000000000000000000..5db8bfd80021d23905dc72cb076a591abebf4c74 --- /dev/null +++ b/docstore/b9208020-13bb-4fce-abc3-6dc27fc2a4d2 @@ -0,0 +1 @@ +Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... 
You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . \ No newline at end of file diff --git a/docstore/b935070a-af02-4b17-910e-160ca36842e4 b/docstore/b935070a-af02-4b17-910e-160ca36842e4 new file mode 100644 index 0000000000000000000000000000000000000000..b176d26cecfbef2389a0ccc241266a4a9ec73ca3 --- /dev/null +++ b/docstore/b935070a-af02-4b17-910e-160ca36842e4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/document-processing#main-content Title: Document understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b940d5a1-8320-46e8-99a3-90a9f3af1a5e b/docstore/b940d5a1-8320-46e8-99a3-90a9f3af1a5e new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/b940d5a1-8320-46e8-99a3-90a9f3af1a5e @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" 
, config = config \ No newline at end of file diff --git a/docstore/b9468b61-a53d-4c98-b2a8-1b1df61d85f6 b/docstore/b9468b61-a53d-4c98-b2a8-1b1df61d85f6 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/b9468b61-a53d-4c98-b2a8-1b1df61d85f6 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/b94a697b-38dd-4e88-a37e-1d4f8cbdebc7 b/docstore/b94a697b-38dd-4e88-a37e-1d4f8cbdebc7 new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/b94a697b-38dd-4e88-a37e-1d4f8cbdebc7 @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. 
Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/b9853948-3162-4b3a-b6b5-8d38c4a62e9b b/docstore/b9853948-3162-4b3a-b6b5-8d38c4a62e9b new file mode 100644 index 0000000000000000000000000000000000000000..a09e2ec8fabf6fa2a2833131a9e374f2b71a5ec1 --- /dev/null +++ b/docstore/b9853948-3162-4b3a-b6b5-8d38c4a62e9b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#summaries Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/b9ac713e-17ca-4fdf-ab70-08d29a8c210a b/docstore/b9ac713e-17ca-4fdf-ab70-08d29a8c210a new file mode 100644 index 0000000000000000000000000000000000000000..b1044b06e974ef70df5275060bd78c27b49af935 --- /dev/null +++ b/docstore/b9ac713e-17ca-4fdf-ab70-08d29a8c210a @@ -0,0 +1 @@ +ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... 
For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the \ No newline at end of file diff --git a/docstore/b9b73e36-0581-412c-841e-4149be12e579 b/docstore/b9b73e36-0581-412c-841e-4149be12e579 new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/b9b73e36-0581-412c-841e-4149be12e579 @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries. Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/b9d31195-707f-4a04-94bb-18380ecc923f b/docstore/b9d31195-707f-4a04-94bb-18380ecc923f new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/b9d31195-707f-4a04-94bb-18380ecc923f @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. 
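To make the response_schema discussion above concrete, here is a minimal sketch of passing a Pydantic type through the google-genai Python SDK; the Recipe model, its fields, and the prompt are illustrative assumptions rather than part of the original example. Python

from google import genai
from pydantic import BaseModel

class Recipe(BaseModel):
    # Illustrative schema; field names and types are assumptions for this sketch.
    recipe_name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List three cookie recipes with their ingredients.",
    config={
        "response_mime_type": "application/json",
        # The SDK builds the JSON schema from the Pydantic type for you.
        "response_schema": list[Recipe],
    },
)
print(response.text)                      # raw JSON text
recipes: list[Recipe] = response.parsed   # parsed objects when validation succeeds

The dict form of the config mirrors the snippet quoted above; types.GenerateContentConfig with the same fields works equally well.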
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/b9f1617b-9d9f-4c78-aaba-1a6877937602 b/docstore/b9f1617b-9d9f-4c78-aaba-1a6877937602 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/b9f1617b-9d9f-4c78-aaba-1a6877937602 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. 
import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/b9f64f0f-78f1-42d5-b46a-4cb79fe849e9 b/docstore/b9f64f0f-78f1-42d5-b46a-4cb79fe849e9 new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/b9f64f0f-78f1-42d5-b46a-4cb79fe849e9 @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
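As a minimal single-speaker sketch of the TTS usage described above, assuming the google-genai Python SDK and one of the voices listed earlier (Sulafat); the output path and the 24 kHz / 16-bit mono WAV framing are illustrative assumptions. Python

import wave
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say warmly: Thanks for listening!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Sulafat")
            )
        ),
    ),
)

# Audio is returned as inline PCM bytes on the first part of the first candidate.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)       # mono (assumed)
    f.setsampwidth(2)       # 16-bit samples (assumed)
    f.setframerate(24000)   # 24 kHz output rate (assumed)
    f.writeframes(pcm)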
For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/ba5e478e-11e8-4879-a772-be1d44a3cf56 b/docstore/ba5e478e-11e8-4879-a772-be1d44a3cf56 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/ba5e478e-11e8-4879-a772-be1d44a3cf56 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ba8bf891-4821-4bae-819f-0c728c98261a b/docstore/ba8bf891-4821-4bae-819f-0c728c98261a new file mode 100644 index 0000000000000000000000000000000000000000..4b6418baecebd23eec6598a4eb723dc1516263bd --- /dev/null +++ b/docstore/ba8bf891-4821-4bae-819f-0c728c98261a @@ -0,0 +1 @@ +default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution. File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference. Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds). Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary. What's next This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. 
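As a quick sanity check on the per-second video token rates quoted above, here is a small arithmetic sketch; the 90-second clip length is an arbitrary example and metadata overhead is ignored. Python

def approx_video_tokens(seconds: int, low_resolution: bool = False) -> int:
    # 1 frame sampled per second: 258 tokens/frame at default resolution, 66 at low.
    frame_tokens = 66 if low_resolution else 258
    audio_tokens = 32  # per second of audio
    return seconds * (frame_tokens + audio_tokens)

print(approx_video_tokens(90))                        # ~26,100 tokens at default resolution
print(approx_video_tokens(90, low_resolution=True))   # ~8,820 tokens at low resolution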
\ No newline at end of file diff --git a/docstore/bab9990a-78b5-4b91-9f39-857401a65a0f b/docstore/bab9990a-78b5-4b91-9f39-857401a65a0f new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/bab9990a-78b5-4b91-9f39-857401a65a0f @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/bac4cda6-875f-40df-8c62-3619f1033d9b b/docstore/bac4cda6-875f-40df-8c62-3619f1033d9b new file mode 100644 index 0000000000000000000000000000000000000000..46252ab65de04d0708193c9bf210d89ae19e4211 --- /dev/null +++ b/docstore/bac4cda6-875f-40df-8c62-3619f1033d9b @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling?example=weather#rest Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/baceacf1-4958-4cdc-b093-a60cc7835312 b/docstore/baceacf1-4958-4cdc-b093-a60cc7835312 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/baceacf1-4958-4cdc-b093-a60cc7835312 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. 
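Returning to the prefix discussion above, here is a small illustrative sketch that combines input prefixes ("English:"), an output prefix ("French:"), and few-shot example pairs; the phrases themselves are invented for illustration. Python

from google import genai

client = genai.Client()
prompt = (
    "English: Good morning\n"      # input prefix marks the source language
    "French: Bonjour\n"            # example prefix shows the expected output format
    "English: Thank you very much\n"
    "French: Merci beaucoup\n"
    "English: See you tomorrow\n"
    "French:"                      # output prefix: the model completes the translation
)
response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
print(response.text)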
The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/bad11bf1-1c95-4674-b35d-5068b0011ea6 b/docstore/bad11bf1-1c95-4674-b35d-5068b0011ea6 new file mode 100644 index 0000000000000000000000000000000000000000..96b87b83020710118b1637c95a4f4b11475a3569 --- /dev/null +++ b/docstore/bad11bf1-1c95-4674-b35d-5068b0011ea6 @@ -0,0 +1 @@ +thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." 
, } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , \ No newline at end of file diff --git a/docstore/bad6e7d1-d07a-490a-b7b0-51afc4145c7a b/docstore/bad6e7d1-d07a-490a-b7b0-51afc4145c7a new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/bad6e7d1-d07a-490a-b7b0-51afc4145c7a @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . 
\ No newline at end of file diff --git a/docstore/bb0791a1-c06b-4a1c-b155-4b21d8cedd24 b/docstore/bb0791a1-c06b-4a1c-b155-4b21d8cedd24 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/bb0791a1-c06b-4a1c-b155-4b21d8cedd24 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/bb0c8486-d261-41c5-afb4-874905482164 b/docstore/bb0c8486-d261-41c5-afb4-874905482164 new file mode 100644 index 0000000000000000000000000000000000000000..2bc9ee1b64943d2fc9ee4b66d281a35e0e278a02 --- /dev/null +++ b/docstore/bb0c8486-d261-41c5-afb4-874905482164 @@ -0,0 +1 @@ +Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. 
Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( \ No newline at end of file diff --git a/docstore/bb2abdc0-276d-4992-bf45-25be130d6fb0 b/docstore/bb2abdc0-276d-4992-bf45-25be130d6fb0 new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/bb2abdc0-276d-4992-bf45-25be130d6fb0 @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. 
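The LiveConnectConfig snippet above is cut off at the chunk boundary; as a hedged sketch of how such a configuration typically continues with the google-genai Python SDK (the trigger and target token counts are illustrative, not prescribed values, and the model name is taken from the Live API model listed earlier). Python

import asyncio
from google import genai
from google.genai import types

client = genai.Client()

config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    context_window_compression=types.ContextWindowCompressionConfig(
        trigger_tokens=25600,                                     # illustrative threshold
        sliding_window=types.SlidingWindow(target_tokens=12800),  # illustrative window size
    ),
)

async def run():
    # Compression keeps the session under the context limit so it isn't terminated early.
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview", config=config
    ) as session:
        ...  # stream audio or text over the session as usual

asyncio.run(run())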
Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/bb69385e-4e15-46ed-858f-8b698b33e669 b/docstore/bb69385e-4e15-46ed-858f-8b698b33e669 new file mode 100644 index 0000000000000000000000000000000000000000..d276bcbbbbd8ffd587f83aebc6d230d4e5c5b078 --- /dev/null +++ b/docstore/bb69385e-4e15-46ed-858f-8b698b33e669 @@ -0,0 +1 @@ +Gemini Developer API Pricing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Pricing The Gemini API "free tier" is offered through the API service with lower rate limits for testing purposes. Google AI Studio usage is completely free in all available countries. The Gemini API "paid tier" comes with higher rate limits , additional features, and different data handling. Upgrade to the Paid Tier If you're looking to reduce costs and your use case doesn't require immediate real-time responses, check out Batch Mode . Batch Mode is designed to process large volumes of requests asynchronously. Requests submitted using this mode is 50% of the price of interactive (non-batch mode) requests. Gemini 2.5 Pro Try it in Google AI Studio Our state-of-the-art multipurpose model, which excels at coding and complex reasoning tasks. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $1.25, prompts <= 200k tokens $2.50, prompts > 200k tokens Output price (including thinking tokens) Free of charge $10.00, prompts <= 200k tokens $15.00, prompts > 200k Context caching price Not available $0.31, prompts <= 200k tokens $0.625, prompts > 200k $4.50 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Not available 1,500 RPD (free), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Try it in Google AI Studio Our first hybrid reasoning model which supports a 1M token context window and has thinking budgets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge $0.30 (text / image / video) $1.00 (audio) Output price (including thinking tokens) Free of charge $2.50 Context \ No newline at end of file diff --git a/docstore/bb7a2425-ec1a-4451-acae-5bc8bd0cf92e b/docstore/bb7a2425-ec1a-4451-acae-5bc8bd0cf92e new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/bb7a2425-ec1a-4451-acae-5bc8bd0cf92e @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/bb8de655-f14c-42a4-b13c-e314a7f7a14e b/docstore/bb8de655-f14c-42a4-b13c-e314a7f7a14e new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/bb8de655-f14c-42a4-b13c-e314a7f7a14e @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. 
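Pulling a few of the recommendations above into one place, here is a hedged sketch using the google-genai Python SDK: a low temperature, a single relevant tool, and an explicit function-calling mode; the weather stub and prompt are illustrative. Python

from google import genai
from google.genai import types

client = genai.Client()

def get_weather_forecast(location: str) -> dict:
    """Gets the current weather for a location (stub for illustration)."""
    return {"temperature": 25, "unit": "celsius"}

config = types.GenerateContentConfig(
    temperature=0,                 # more deterministic, reliable function calls
    tools=[get_weather_forecast],  # keep the active tool set small and relevant
    tool_config=types.ToolConfig(
        function_calling_config=types.FunctionCallingConfig(mode="AUTO")
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="You are a helpful weather assistant. What's the weather in Paris?",
    config=config,
)
print(response.text)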
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/bb943f99-67ab-4e39-aad5-d0eadd0e4934 b/docstore/bb943f99-67ab-4e39-aad5-d0eadd0e4934 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/bb943f99-67ab-4e39-aad5-d0eadd0e4934 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/bb97b9bd-1605-4939-b75c-1db0e2783ea2 b/docstore/bb97b9bd-1605-4939-b75c-1db0e2783ea2 new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/bb97b9bd-1605-4939-b75c-1db0e2783ea2 @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. 
Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/bbb47123-8ad3-44f3-a5d2-c849f9168f0c b/docstore/bbb47123-8ad3-44f3-a5d2-c849f9168f0c new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/bbb47123-8ad3-44f3-a5d2-c849f9168f0c @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. 
Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/bc014eca-540e-4179-8a67-65d3d4654854 b/docstore/bc014eca-540e-4179-8a67-65d3d4654854 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/bc014eca-540e-4179-8a67-65d3d4654854 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/bc08b299-2a7c-4f09-b513-f3e2894d77ec b/docstore/bc08b299-2a7c-4f09-b513-f3e2894d77ec new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/bc08b299-2a7c-4f09-b513-f3e2894d77ec @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . 
text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/bc40c070-82d9-4e4a-ad1f-740ddc72a95a b/docstore/bc40c070-82d9-4e4a-ad1f-740ddc72a95a new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/bc40c070-82d9-4e4a-ad1f-740ddc72a95a @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/bc4a6f53-4bee-407c-960f-fbd05207a86f b/docstore/bc4a6f53-4bee-407c-960f-fbd05207a86f new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/bc4a6f53-4bee-407c-960f-fbd05207a86f @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/bc4a74d2-dfbe-47cf-9b08-22d0fba68d4f b/docstore/bc4a74d2-dfbe-47cf-9b08-22d0fba68d4f new file mode 100644 index 0000000000000000000000000000000000000000..17bfa7cb7ce514bf0ade86c26f1bff30fbb20a2e --- /dev/null +++ b/docstore/bc4a74d2-dfbe-47cf-9b08-22d0fba68d4f @@ -0,0 +1 @@ +model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ \ No newline at end of file diff --git a/docstore/bca2baf2-1d2c-424d-a3ef-2278d52332a5 b/docstore/bca2baf2-1d2c-424d-a3ef-2278d52332a5 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/bca2baf2-1d2c-424d-a3ef-2278d52332a5 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/bcbaca71-5a54-4089-9ac4-f943fd2a0ae0 b/docstore/bcbaca71-5a54-4089-9ac4-f943fd2a0ae0 new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/bcbaca71-5a54-4089-9ac4-f943fd2a0ae0 @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. 
Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/bcbe36dd-2ee1-4a05-a5a6-a20afdd152a5 b/docstore/bcbe36dd-2ee1-4a05-a5a6-a20afdd152a5 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/bcbe36dd-2ee1-4a05-a5a6-a20afdd152a5 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . 
STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/bcd0237e-31af-4f57-948a-5897f466e6ce b/docstore/bcd0237e-31af-4f57-948a-5897f466e6ce new file mode 100644 index 0000000000000000000000000000000000000000..68dfcf53eb693dba8358b7fdf6b0010fadcbc966 --- /dev/null +++ b/docstore/bcd0237e-31af-4f57-948a-5897f466e6ce @@ -0,0 +1 @@ +. files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), ]), }); console . log ( countTokensResponse . totalTokens ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } tokens , _ := client . Models . CountTokens ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Printf ( "File %s is %d tokens\n" , localAudioPath , tokens . TotalTokens ) } Supported audio formats Gemini supports the following audio format MIME types: WAV - audio/wav MP3 - audio/mp3 AIFF - audio/aiff AAC - audio/aac OGG Vorbis - audio/ogg FLAC - audio/flac Technical details about audio Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens. Gemini can "understand" non-speech components, such as birdsong or sirens. The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the number of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours. Gemini downsamples audio files to a 16 Kbps data resolution. If the audio source contains multiple channels, Gemini combines those channels into a single channel. What's next This guide shows how to generate text in response to audio data. To learn more, see the following resources: File prompting strategies : \ No newline at end of file diff --git a/docstore/bcde7ca8-760b-48ad-ae44-0ec981dcee1d b/docstore/bcde7ca8-760b-48ad-ae44-0ec981dcee1d new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/bcde7ca8-760b-48ad-ae44-0ec981dcee1d @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. 
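Since the table that follows only names the replacement packages, here is a minimal before/after sketch of what migrating to the new Python SDK typically looks like. It is a sketch only, assuming the legacy google-generativeai package and the new google-genai package, with a GEMINI_API_KEY available in the environment; the model names and prompt are illustrative.

```python
# Legacy library (google-generativeai) -- on a deprecation path:
# import google.generativeai as genai
# genai.configure(api_key="...")            # explicit key configuration
# model = genai.GenerativeModel("gemini-1.5-flash")
# print(model.generate_content("Explain how AI works").text)

# New library (google-genai) -- the recommended replacement:
from google import genai

# The client reads GEMINI_API_KEY from the environment by default.
client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works",
)
print(response.text)
```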
They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/bcec232a-df3d-4f50-8986-01aa481ffe8b b/docstore/bcec232a-df3d-4f50-8986-01aa481ffe8b new file mode 100644 index 0000000000000000000000000000000000000000..954b7c2f84cbdb4650dd5497451af99dc34fb5db --- /dev/null +++ b/docstore/bcec232a-df3d-4f50-8986-01aa481ffe8b @@ -0,0 +1 @@ +GoogleGenAI ({}); let response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." , ], config : { tools : [{ codeExecution : {} }], }, }); const parts = response ? . candidates ? .[ 0 ] ? . content ? . parts || []; parts . forEach (( part ) = > { if ( part . text ) { console . log ( part . text ); } if ( part . executableCode && part . executableCode . code ) { console . log ( part . executableCode . code ); } if ( part . codeExecutionResult && part . codeExecutionResult . output ) { console . log ( part . codeExecutionResult . output ); } }); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." ), config , ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d ' {"tools": [{"code_execution": {}}], "contents": { "parts": { "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." 
} }, }' Note: This REST example doesn't parse the JSON response as shown in the example output. The output might look something like the following, which has been formatted for readability: Okay, I need to calculate \ No newline at end of file diff --git a/docstore/bcec571e-0c1d-49ef-8dc5-edffabf1e5b7 b/docstore/bcec571e-0c1d-49ef-8dc5-edffabf1e5b7 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/bcec571e-0c1d-49ef-8dc5-edffabf1e5b7 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/bcfb0a5e-8dbd-416d-a546-bc2779b98ab4 b/docstore/bcfb0a5e-8dbd-416d-a546-bc2779b98ab4 new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/bcfb0a5e-8dbd-416d-a546-bc2779b98ab4 @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. 
while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/bd35cc1c-fabb-465e-9cb0-145eafb7af07 b/docstore/bd35cc1c-fabb-465e-9cb0-145eafb7af07 new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/bd35cc1c-fabb-465e-9cb0-145eafb7af07 @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. 
Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/bd39ccef-a789-4f72-9a3b-675e11da677f b/docstore/bd39ccef-a789-4f72-9a3b-675e11da677f new file mode 100644 index 0000000000000000000000000000000000000000..1c3c1b9b46e1c38e34dd8cd82807f79c808d7249 --- /dev/null +++ b/docstore/bd39ccef-a789-4f72-9a3b-675e11da677f @@ -0,0 +1 @@ +sketches, to hyper-realistic digital art. For example, the following images use the same prompt with different styles: "An [art style or creation technique] of an angular sporty electric sedan with skyscrapers in the background" Prompt: A technical pencil drawing of an angular... Prompt: A charcoal drawing of an angular... Prompt: A color pencil drawing of an angular... Prompt: A pastel painting of an angular... Prompt: A digital art of an angular... Prompt: An art deco (poster) of an angular... Image source: Each image was generated using its corresponding text prompt with the Imagen 2 model. Shapes and materials Prompt includes: "...made of..." , "...in the shape of..." One of the strengths of this technology is that you can create imagery that is otherwise difficult or impossible. For example, you can recreate your company logo in different materials and textures. Prompt: a duffle bag made of cheese Prompt: neon tubes in the shape of a bird Prompt: an armchair made of paper , studio photo, origami style Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Historical art references Prompt includes: "...in the style of..." Certain styles have become iconic over the years. The following are some ideas of historical painting or art styles that you can try. "generate an image in the style of [art period or movement] : a wind farm" Prompt: generate an image in the style of an impressionist painting : a wind farm Prompt: generate an image in the style of a renaissance painting : a wind farm Prompt: generate an image in the style of pop art : a wind farm Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Image quality modifiers Certain keywords can let the model know that you're looking for a high-quality asset. Examples of quality modifiers include the following: General Modifiers - high-quality, beautiful, stylized Photos - 4K, HDR, Studio Photo Art, Illustration - by a \ No newline at end of file diff --git a/docstore/bd4864e1-e641-408c-a4ee-05cdfa66e2f4 b/docstore/bd4864e1-e641-408c-a4ee-05cdfa66e2f4 new file mode 100644 index 0000000000000000000000000000000000000000..e7916db67e8d85a6eb6800021ec1af5f73a400b3 --- /dev/null +++ b/docstore/bd4864e1-e641-408c-a4ee-05cdfa66e2f4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#token-size Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/bd6f8bdb-8fed-4711-9f45-0abb742c0efd b/docstore/bd6f8bdb-8fed-4711-9f45-0abb742c0efd new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/bd6f8bdb-8fed-4711-9f45-0abb742c0efd @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. 
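As a rough illustration of the style and quality-modifier guidance above, the following sketch passes one of those styled prompts to Imagen through the Python SDK. The model ID and config fields mirror the Imagen examples elsewhere in these docs; the exact prompt, aspect ratio, and output filename are arbitrary, and writing image_bytes to disk is one assumed way to save the result.

```python
from google import genai
from google.genai import types

client = genai.Client()

# An art-style phrase plus quality modifiers, as suggested above.
prompt = (
    "A technical pencil drawing of an angular sporty electric sedan "
    "with skyscrapers in the background, high-quality, 4K"
)

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt=prompt,
    config=types.GenerateImagesConfig(
        number_of_images=1,
        aspect_ratio="16:9",
    ),
)

# Assumed way to persist the result: write the raw image bytes to disk.
with open("styled-sedan.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)
```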
Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/bd78c22c-7594-4792-add0-5eece9d6ceec b/docstore/bd78c22c-7594-4792-add0-5eece9d6ceec new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/bd78c22c-7594-4792-add0-5eece9d6ceec @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. 
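The "How function calling works" breakdown below walks through this loop step by step; as a companion, here is a minimal Python sketch of the same flow using a hypothetical get_weather function (echoing the getWeather example used elsewhere on this page). The declaration, dummy weather result, and prompt are illustrative, not part of the official docs.

```python
from google import genai
from google.genai import types

client = genai.Client()

# 1. Describe the function to the model (a hypothetical weather lookup).
weather_declaration = types.FunctionDeclaration(
    name="get_weather",
    description="Gets the current weather for a city.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"city": types.Schema(type=types.Type.STRING)},
        required=["city"],
    ),
)
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[weather_declaration])]
)

# 2. Send the prompt together with the declaration.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the weather in Lake Tahoe?",
    config=config,
)

# 3. The model responds with a structured function call instead of text.
call = response.candidates[0].content.parts[0].function_call
print(call.name, call.args)

# 4. Your application executes the function (dummy result here) and sends
#    the result back so the model can produce the final answer.
follow_up = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Content(
            role="user",
            parts=[types.Part.from_text(text="What is the weather in Lake Tahoe?")],
        ),
        response.candidates[0].content,
        types.Content(
            role="user",
            parts=[types.Part.from_function_response(
                name="get_weather",
                response={"result": "Sunny and hot, 90 degrees Fahrenheit"},
            )],
        ),
    ],
    config=config,
)
print(follow_up.text)
```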
Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/bd89eaff-a050-4516-a6b0-ca0158def308 b/docstore/bd89eaff-a050-4516-a6b0-ca0158def308 new file mode 100644 index 0000000000000000000000000000000000000000..4ce25c45c0956235a2a76b8ce578fbaaad6010c8 --- /dev/null +++ b/docstore/bd89eaff-a050-4516-a6b0-ca0158def308 @@ -0,0 +1 @@ +clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": \ No newline at end of file diff --git a/docstore/bd8aa976-06eb-4e29-886e-2e6812613cf6 b/docstore/bd8aa976-06eb-4e29-886e-2e6812613cf6 new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/bd8aa976-06eb-4e29-886e-2e6812613cf6 @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . 
inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/bd9d8bdc-f4e4-43dd-a5c5-c884c199d6ce b/docstore/bd9d8bdc-f4e4-43dd-a5c5-c884c199d6ce new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/bd9d8bdc-f4e4-43dd-a5c5-c884c199d6ce @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. 
aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/bdacce44-56de-44e2-be00-1fc454bc42d3 b/docstore/bdacce44-56de-44e2-be00-1fc454bc42d3 new file mode 100644 index 0000000000000000000000000000000000000000..58a1a958a678d222ca4e5b83565d3af6116c766e --- /dev/null +++ b/docstore/bdacce44-56de-44e2-be00-1fc454bc42d3 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-settings#safety-filtering-per-request Title: Safety settings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/bdb6fe34-c147-4a73-92f5-2ce56707d32f b/docstore/bdb6fe34-c147-4a73-92f5-2ce56707d32f new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/bdb6fe34-c147-4a73-92f5-2ce56707d32f @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. 
Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/bdde20d6-4a72-44be-b531-2040bb72b221 b/docstore/bdde20d6-4a72-44be-b531-2040bb72b221 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/bdde20d6-4a72-44be-b531-2040bb72b221 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/be025e0c-278f-4af7-b6c9-647c0a9816dd b/docstore/be025e0c-278f-4af7-b6c9-647c0a9816dd new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/be025e0c-278f-4af7-b6c9-647c0a9816dd @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/be083f58-527a-4a2f-a9e9-76aa2e94ed2e b/docstore/be083f58-527a-4a2f-a9e9-76aa2e94ed2e new file mode 100644 index 0000000000000000000000000000000000000000..b76b6c3627992382bef4112623cdd2870605dd8e --- /dev/null +++ b/docstore/be083f58-527a-4a2f-a9e9-76aa2e94ed2e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#inline-image Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/be08d8e5-89e8-4278-b526-d1fd9e8ab202 b/docstore/be08d8e5-89e8-4278-b526-d1fd9e8ab202 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/be08d8e5-89e8-4278-b526-d1fd9e8ab202 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/be09bee1-35ad-40c4-a829-efb1310faaf8 b/docstore/be09bee1-35ad-40c4-a829-efb1310faaf8 new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/be09bee1-35ad-40c4-a829-efb1310faaf8 @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . 
GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/be225969-646f-433b-a1a0-def4fd55497c b/docstore/be225969-646f-433b-a1a0-def4fd55497c new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/be225969-646f-433b-a1a0-def4fd55497c @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/be259828-8487-424c-8bf4-a24b90f2e1a7 b/docstore/be259828-8487-424c-8bf4-a24b90f2e1a7 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/be259828-8487-424c-8bf4-a24b90f2e1a7 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/be3fb2e0-192c-4847-a105-102a36f03da7 b/docstore/be3fb2e0-192c-4847-a105-102a36f03da7 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/be3fb2e0-192c-4847-a105-102a36f03da7 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . 
stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/be5b6ffb-003b-4a9f-be1e-003531c09e8a b/docstore/be5b6ffb-003b-4a9f-be1e-003531c09e8a new file mode 100644 index 0000000000000000000000000000000000000000..b23a8acc5f0d54a573ae6bf2c9ff53a2c6e1da77 --- /dev/null +++ b/docstore/be5b6ffb-003b-4a9f-be1e-003531c09e8a @@ -0,0 +1 @@ +Rate limits | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Rate limits Rate limits regulate the number of requests you can make to the Gemini API within a given timeframe. These limits help maintain fair usage, protect against abuse, and help maintain system performance for all users. How rate limits work Rate limits are usually measured across three dimensions: Requests per minute ( RPM ) Requests per day ( RPD ) Tokens per minute (input) ( TPM ) Your usage is evaluated against each limit, and exceeding any of them will trigger a rate limit error. For example, if your RPM limit is 20, making 21 requests within a minute will result in an error, even if you haven't exceeded your TPM or other limits. Rate limits are applied per project, not per API key. Limits vary depending on the specific model being used, and some limits only apply to specific models. For example, Images per minute, or IPM, is only calculated for models capable of generating images (Imagen 3), but is conceptually similar to TPM. Other models might have a token per day limit (TPD). 
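Because exceeding any one of these dimensions triggers a rate-limit error, client code typically retries with backoff. The sketch below shows one way to do that with the Python SDK; it assumes the SDK surfaces the rejection as google.genai.errors.APIError with a 429 code, and the retry count and sleep times are arbitrary.

```python
import time

from google import genai
from google.genai import errors

client = genai.Client()

def generate_with_backoff(prompt: str, max_retries: int = 5) -> str:
    """Retry on rate-limit errors with exponential backoff (assumed 429 code)."""
    delay = 1.0
    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt,
            )
            return response.text
        except errors.APIError as e:
            # 429 = rate limit exceeded (RPM, RPD, or TPM); back off and retry.
            if e.code == 429 and attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2
            else:
                raise

print(generate_with_backoff("Explain how AI works in one sentence."))
```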
Rate limits are more restricted for experimental and preview models. Usage tiers Rate limits are tied to the project's usage tier. As your API usage and spending increase, you'll have an option to upgrade to a higher tier with increased rate limits. Tier Qualifications Free Users in eligible countries Tier 1 Billing account linked to the project Tier 2 Total spend: > $250 and at least 30 days since successful payment Tier 3 Total spend: > $1,000 and at least 30 days since successful payment When you request an upgrade, our automated abuse protection system performs additional checks. \ No newline at end of file diff --git a/docstore/be67a571-e93d-4b9c-b4e3-3df0b06681ae b/docstore/be67a571-e93d-4b9c-b4e3-3df0b06681ae new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/be67a571-e93d-4b9c-b4e3-3df0b06681ae @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. 
Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/be683aac-d22c-4210-8bfc-b663d93eea65 b/docstore/be683aac-d22c-4210-8bfc-b663d93eea65 new file mode 100644 index 0000000000000000000000000000000000000000..593f19ae304227c2d3d55e6879a85836457ba81b --- /dev/null +++ b/docstore/be683aac-d22c-4210-8bfc-b663d93eea65 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode#main-content Title: Batch Mode | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/be690809-3e1a-4c86-948f-e6aaff88a737 b/docstore/be690809-3e1a-4c86-948f-e6aaff88a737 new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/be690809-3e1a-4c86-948f-e6aaff88a737 @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/be706b38-6c2b-4030-af0b-8004b094717b b/docstore/be706b38-6c2b-4030-af0b-8004b094717b new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/be706b38-6c2b-4030-af0b-8004b094717b @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. 
Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/be898c04-39c1-4291-8ee5-ef2091905412 b/docstore/be898c04-39c1-4291-8ee5-ef2091905412 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/be898c04-39c1-4291-8ee5-ef2091905412 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/be8fcd1b-da59-4a09-aa61-32beda0a94bf b/docstore/be8fcd1b-da59-4a09-aa61-32beda0a94bf new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/be8fcd1b-da59-4a09-aa61-32beda0a94bf @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . 
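The note above about switching off automatic tool calling can be illustrated with a small, self-contained Python sketch (added here, not from the original page). It uses a plain Python callable as the tool; the exact config field names (automatic_function_calling, disable) are assumptions based on the SDK reference, so verify them against the current google-genai release.

from datetime import datetime, timezone
from google import genai
from google.genai import types

def get_current_time() -> str:
    """Returns the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()

client = genai.Client()

# With a plain callable in `tools`, the SDK normally executes it automatically.
# Disabling automatic function calling surfaces the call to the application instead.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What time is it right now?",
    config=types.GenerateContentConfig(
        tools=[get_current_time],
        automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
    ),
)

if response.function_calls:
    print("Model requested:", response.function_calls[0].name)
else:
    print(response.text)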
import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/be9a41ef-283e-44aa-9184-501e8f0b5d66 b/docstore/be9a41ef-283e-44aa-9184-501e8f0b5d66 new file mode 100644 index 0000000000000000000000000000000000000000..b01e3d7381233454c91a082a896e317b3489c857 --- /dev/null +++ b/docstore/be9a41ef-283e-44aa-9184-501e8f0b5d66 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#specific-instructions Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/bec13c95-b908-4050-8091-3b5aeb78dc78 b/docstore/bec13c95-b908-4050-8091-3b5aeb78dc78 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/bec13c95-b908-4050-8091-3b5aeb78dc78 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/bee6c3e4-6d05-4b19-9003-27235787023f b/docstore/bee6c3e4-6d05-4b19-9003-27235787023f new file mode 100644 index 0000000000000000000000000000000000000000..b99824a0bb181cb1be6367ec11bfeefdd4ec4b3d --- /dev/null +++ b/docstore/bee6c3e4-6d05-4b19-9003-27235787023f @@ -0,0 +1 @@ +AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H 
"x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass audio data inline Instead of uploading an audio file, you can pass inline audio data in the request to generateContent : Python from google.genai import types with open ( 'path/to/small-sample.mp3' , 'rb' ) as f : audio_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ 'Describe this audio clip' , types . Part . from_bytes ( data = audio_bytes , mime_type = 'audio/mp3' , ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64AudioFile = fs . readFileSync ( "path/to/small-sample.mp3" , { encoding : "base64" , }); const contents = [ { text : "Please summarize the audio." }, { inlineData : { mimeType : "audio/mp3" , data : base64AudioFile , }, }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } audioBytes , _ := os . ReadFile ( "/path/to/small-sample.mp3" ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), & genai . \ No newline at end of file diff --git a/docstore/beef44cf-87bf-4f01-a073-dcf5404b42f7 b/docstore/beef44cf-87bf-4f01-a073-dcf5404b42f7 new file mode 100644 index 0000000000000000000000000000000000000000..3d32a6c6f44782138d2600dc9a5e7c5bf75a9a24 --- /dev/null +++ b/docstore/beef44cf-87bf-4f01-a073-dcf5404b42f7 @@ -0,0 +1 @@ +in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." 
JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing \ No newline at end of file diff --git a/docstore/bef08c17-1716-4b0d-8d97-c154ed238df2 b/docstore/bef08c17-1716-4b0d-8d97-c154ed238df2 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/bef08c17-1716-4b0d-8d97-c154ed238df2 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/befd645b-2ffb-4ffb-bd4a-1bd5476fa6e6 b/docstore/befd645b-2ffb-4ffb-bd4a-1bd5476fa6e6 new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/befd645b-2ffb-4ffb-bd4a-1bd5476fa6e6 @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. 
If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/bf0b6bcb-56cb-49ca-b5ba-4b1374e234d4 b/docstore/bf0b6bcb-56cb-49ca-b5ba-4b1374e234d4 new file mode 100644 index 0000000000000000000000000000000000000000..a5c7df71403cc48e8e56352e55ad417999aabca3 --- /dev/null +++ b/docstore/bf0b6bcb-56cb-49ca-b5ba-4b1374e234d4 @@ -0,0 +1 @@ +. getGenerativeModel ({ model : "gemini-1.5-flash" , safetySettings : [ { category : HarmCategory . HARM_CATEGORY_HARASSMENT , threshold : HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , }, ], }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const result = await model . generateContent ( unsafePrompt ); try { result . response . text (); } catch ( e ) { console . error ( e ); console . log ( result . response . candidates [ 0 ]. safetyRatings ); } After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'say something bad' , config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = 'HARM_CATEGORY_HATE_SPEECH' , threshold = 'BLOCK_ONLY_HIGH' ), ] ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : unsafePrompt , config : { safetySettings : [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_ONLY_HIGH" , }, ], }, }); console . 
log ( "Finish reason:" , response . candidates [ 0 ]. finishReason ); console . log ( "Safety ratings:" , response . candidates [ 0 ]. safetyRatings ); Async Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content_async ( 'tell me a story in 100 words' ) After Python To use the new SDK with asyncio , there is a separate async implementation of every method under client.aio . from google import genai client = genai . Client () response = await \ No newline at end of file diff --git a/docstore/bf0e9d14-256c-464f-99d6-a55478958512 b/docstore/bf0e9d14-256c-464f-99d6-a55478958512 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/bf0e9d14-256c-464f-99d6-a55478958512 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/bf155b09-3639-4ed2-9d78-d8175ad64a9c b/docstore/bf155b09-3639-4ed2-9d78-d8175ad64a9c new file mode 100644 index 0000000000000000000000000000000000000000..f65bfb5d195a3160683160d98bf38afd321eba5f --- /dev/null +++ b/docstore/bf155b09-3639-4ed2-9d78-d8175ad64a9c @@ -0,0 +1 @@ +Last updated 2025-06-27 UTC. 
\ No newline at end of file diff --git a/docstore/bf1cf381-5fe7-4df0-9f33-7407f06c0b45 b/docstore/bf1cf381-5fe7-4df0-9f33-7407f06c0b45 new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/bf1cf381-5fe7-4df0-9f33-7407f06c0b45 @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/bf2aafbc-ab10-49ef-b881-e34c0b2e98c6 b/docstore/bf2aafbc-ab10-49ef-b881-e34c0b2e98c6 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/bf2aafbc-ab10-49ef-b881-e34c0b2e98c6 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/bf3684bd-9594-49b0-be9c-efc0044bf9ff b/docstore/bf3684bd-9594-49b0-be9c-efc0044bf9ff new file mode 100644 index 0000000000000000000000000000000000000000..32cd4b13d40c580a7878291a6621896a7de36f97 --- /dev/null +++ b/docstore/bf3684bd-9594-49b0-be9c-efc0044bf9ff @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-pro-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/bf45df97-37d0-4ef6-8266-771bcc5e2cf2 b/docstore/bf45df97-37d0-4ef6-8266-771bcc5e2cf2 new file mode 100644 index 0000000000000000000000000000000000000000..150f8758ce4500c63fdc2d62f5bb812ca3b2d976 --- /dev/null +++ b/docstore/bf45df97-37d0-4ef6-8266-771bcc5e2cf2 @@ -0,0 +1 @@ +client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . 
readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/bf6bafe6-46e1-4a4a-be4e-cfcf0dd86809 b/docstore/bf6bafe6-46e1-4a4a-be4e-cfcf0dd86809 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/bf6bafe6-46e1-4a4a-be4e-cfcf0dd86809 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/bf75a065-c40e-4863-8420-305e3f31bcb3 b/docstore/bf75a065-c40e-4863-8420-305e3f31bcb3 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/bf75a065-c40e-4863-8420-305e3f31bcb3 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). 
Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/bf87c10c-1a3c-4468-b5f2-1914921ccd2e b/docstore/bf87c10c-1a3c-4468-b5f2-1914921ccd2e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/bf87c10c-1a3c-4468-b5f2-1914921ccd2e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/bf8bf1c1-69ca-4daf-9323-33bab9b0972a b/docstore/bf8bf1c1-69ca-4daf-9323-33bab9b0972a new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/bf8bf1c1-69ca-4daf-9323-33bab9b0972a @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls as they require multiple turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/bf9a2950-661f-4afc-b053-ceade78d1c77 b/docstore/bf9a2950-661f-4afc-b053-ceade78d1c77 new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/bf9a2950-661f-4afc-b053-ceade78d1c77 @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . 
debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/bfca05c4-eab6-4fb2-a0f1-395578e38c53 b/docstore/bfca05c4-eab6-4fb2-a0f1-395578e38c53 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/bfca05c4-eab6-4fb2-a0f1-395578e38c53 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. 
Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/bfdd4a0d-e072-45fd-b5e8-8e77d24744cb b/docstore/bfdd4a0d-e072-45fd-b5e8-8e77d24744cb new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/bfdd4a0d-e072-45fd-b5e8-8e77d24744cb @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/bfdea42a-48ae-4f5e-8034-b4abefd8418b b/docstore/bfdea42a-48ae-4f5e-8034-b4abefd8418b new file mode 100644 index 0000000000000000000000000000000000000000..17bfa7cb7ce514bf0ade86c26f1bff30fbb20a2e --- /dev/null +++ b/docstore/bfdea42a-48ae-4f5e-8034-b4abefd8418b @@ -0,0 +1 @@ +model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . 
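As a hedged illustration of the "put your image first" guidance above (added here, not part of the original page), a minimal Python sketch with the google-genai SDK; the file path and prompt are placeholders.

from google import genai
from google.genai import types

client = genai.Client()

with open("path/to/small-sample.jpg", "rb") as f:
    image_bytes = f.read()

# For a single-image prompt, place the image part before the text part.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Parse the time and city from the airport board shown in this image into a list.",
    ],
)
print(response.text)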
NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ \ No newline at end of file diff --git a/docstore/bfef8db1-0050-4631-9d52-5355135d927d b/docstore/bfef8db1-0050-4631-9d52-5355135d927d new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/bfef8db1-0050-4631-9d52-5355135d927d @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
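For comparison with the resumable REST upload flow shown above, here is a minimal Python sketch (added for illustration) using the Files API upload helper; the client.files.upload call is based on my reading of the SDK, and the file path is a placeholder.

from google import genai

client = genai.Client()

# The SDK wraps the start/upload/finalize steps of the resumable Files API flow.
myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=["Describe this audio clip", myfile],
)
print(response.text)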
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/c002a379-e825-4ca3-a571-08dfeb02adb9 b/docstore/c002a379-e825-4ca3-a571-08dfeb02adb9 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/c002a379-e825-4ca3-a571-08dfeb02adb9 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/c00b276d-e760-402e-a1e0-3521fa6174a8 b/docstore/c00b276d-e760-402e-a1e0-3521fa6174a8 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/c00b276d-e760-402e-a1e0-3521fa6174a8 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/c0132a9e-7ecc-4f61-a239-b0c8baf74d64 b/docstore/c0132a9e-7ecc-4f61-a239-b0c8baf74d64 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/c0132a9e-7ecc-4f61-a239-b0c8baf74d64 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/c024e9be-fa8f-48de-8113-ea2b31e631af b/docstore/c024e9be-fa8f-48de-8113-ea2b31e631af new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/c024e9be-fa8f-48de-8113-ea2b31e631af @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . 
repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/c02cff2e-7fd2-4254-9a8c-04bf00cd2bed b/docstore/c02cff2e-7fd2-4254-9a8c-04bf00cd2bed new file mode 100644 index 0000000000000000000000000000000000000000..41dedb01cb0b9c984f39578d0001dc7776e6fe12 --- /dev/null +++ b/docstore/c02cff2e-7fd2-4254-9a8c-04bf00cd2bed @@ -0,0 +1 @@ +, 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); // Load the image from the local file system const imagePath = "path/to/image.png" ; const imageData = fs . readFileSync ( imagePath ); const base64Image = imageData . toString ( "base64" ); // Prepare the content parts const contents = [ { text : "Can you add a llama next to the image?" }, { inlineData : { mimeType : "image/png" , data : base64Image , }, }, ]; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/image.png" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Hi, This is \ No newline at end of file diff --git a/docstore/c03e427c-b7ad-4cfc-b50f-73f030c5d02e b/docstore/c03e427c-b7ad-4cfc-b50f-73f030c5d02e new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/c03e427c-b7ad-4cfc-b50f-73f030c5d02e @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. 
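A minimal Python sketch of this troubleshooting step, reusing the inline PIL image pattern shown elsewhere on this page, might look like the following; the file path and model choice are placeholders, not part of the original example.

Python
from google import genai
from PIL import Image

client = genai.Client()
image = Image.open("path/to/snack-photo.png")  # placeholder path

# First confirm what the model thinks is in the image...
check = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, "Describe what's in this image."],
)
print(check.text)

# ...then re-run the original task once recognition looks correct.
answer = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, "What's a snack I can make in 1 minute that would go well with this?"],
)
print(answer.text)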
Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/c042db03-49bb-4faf-a2ee-8c9245e2d1ed b/docstore/c042db03-49bb-4faf-a2ee-8c9245e2d1ed new file mode 100644 index 0000000000000000000000000000000000000000..52610712dbf2353aaed52725f4b08c6fcab5e422 --- /dev/null +++ b/docstore/c042db03-49bb-4faf-a2ee-8c9245e2d1ed @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#prompt-guide Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c05fc689-259a-42ae-9290-4742c59894b6 b/docstore/c05fc689-259a-42ae-9290-4742c59894b6 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/c05fc689-259a-42ae-9290-4742c59894b6 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
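One way to exercise that long context is to upload a large media file once and reference it in a prompt. The following Python sketch combines the files.upload call shown later on this page with a generate_content request; passing the uploaded file object directly in contents and the placeholder file path are assumptions for illustration.

Python
from google import genai

client = genai.Client()

# Upload a long recording once, then reference it in a prompt.
myfile = client.files.upload(file="path/to/long-recording.mp3")  # placeholder path

response = client.models.generate_content(
    model="gemini-1.5-pro",
    contents=[myfile, "Summarize the key decisions made in this recording."],
)
print(response.text)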
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/c080989b-1f4f-4d0f-bf1a-190909983a85 b/docstore/c080989b-1f4f-4d0f-bf1a-190909983a85 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/c080989b-1f4f-4d0f-bf1a-190909983a85 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/c0b16fb2-4263-497e-b1d1-bc5bc9d798cc b/docstore/c0b16fb2-4263-497e-b1d1-bc5bc9d798cc new file mode 100644 index 0000000000000000000000000000000000000000..6b5570dc552776eef13cf8339199673fd1c28eb5 --- /dev/null +++ b/docstore/c0b16fb2-4263-497e-b1d1-bc5bc9d798cc @@ -0,0 +1 @@ +Generate an image Note: Image generation is only available in the paid tier. 
Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print \ No newline at end of file diff --git a/docstore/c0c64309-3d5c-40a9-a85b-f86f4d6ffbd1 b/docstore/c0c64309-3d5c-40a9-a85b-f86f4d6ffbd1 new file mode 100644 index 0000000000000000000000000000000000000000..1d5a02022906f295c3ad625acee2d3f5c63827ae --- /dev/null +++ b/docstore/c0c64309-3d5c-40a9-a85b-f86f4d6ffbd1 @@ -0,0 +1 @@ +Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. 
They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site \ No newline at end of file diff --git a/docstore/c0cb0708-00f8-4ffb-aa4f-32b1973a73eb b/docstore/c0cb0708-00f8-4ffb-aa4f-32b1973a73eb new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/c0cb0708-00f8-4ffb-aa4f-32b1973a73eb @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. 
The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/c0cc0ad4-5534-4c35-bd3c-d44211ae3e05 b/docstore/c0cc0ad4-5534-4c35-bd3c-d44211ae3e05 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/c0cc0ad4-5534-4c35-bd3c-d44211ae3e05 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/c1308d48-37fc-4289-9eed-0de70f802339 b/docstore/c1308d48-37fc-4289-9eed-0de70f802339 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/c1308d48-37fc-4289-9eed-0de70f802339 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
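Pulling the function-calling steps shown earlier (declare the function, send the request with tools, then execute the returned call) into one place, a compact end-to-end Python sketch might look like the following; the declaration schema and the set_light_values stub are illustrative stand-ins for the definitions in that guide, not the official ones.

Python
from google import genai
from google.genai import types

# Illustrative stub standing in for the real device-control function.
def set_light_values(brightness: int, color_temp: str) -> dict:
    return {"brightness": brightness, "colorTemperature": color_temp}

# Declaration the model is allowed to call (schema abbreviated for this sketch).
set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Set the brightness and color temperature of a room light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {"type": "integer"},
            "color_temp": {"type": "string", "enum": ["daylight", "cool", "warm"]},
        },
        "required": ["brightness", "color_temp"],
    },
}

client = genai.Client()
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[set_light_values_declaration])]
)

# Step 2: the model returns a function call instead of plain text.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Turn the lights down to a romantic level",
    config=config,
)

# Step 3: execute the requested function with the model-supplied arguments.
tool_call = response.candidates[0].content.parts[0].function_call
if tool_call.name == "set_light_values":
    result = set_light_values(**tool_call.args)
    print(f"Function execution result: {result}")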
\ No newline at end of file diff --git a/docstore/c13b082c-3b6a-45b6-bc93-07cff7e281cf b/docstore/c13b082c-3b6a-45b6-bc93-07cff7e281cf new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/c13b082c-3b6a-45b6-bc93-07cff7e281cf @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/c15294a6-d772-4b7c-84de-ccd1da255aea b/docstore/c15294a6-d772-4b7c-84de-ccd1da255aea new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/c15294a6-d772-4b7c-84de-ccd1da255aea @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
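As a back-of-the-envelope illustration of the token-calculation rules above, the following sketch estimates the image token cost for the 2.0 Flash and 2.5 Flash/Pro models; the tile-count arithmetic for large images is an assumption derived from the 768x768 tiling described there, not an official formula.

Python
import math

def estimate_image_tokens(width: int, height: int) -> int:
    """Rough token estimate for Gemini 2.0 Flash and 2.5 Flash/Pro image input.

    Assumption: images with both dimensions <= 384 px cost a flat 258 tokens;
    larger images are tiled into 768x768 tiles at 258 tokens per tile.
    """
    if width <= 384 and height <= 384:
        return 258
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(300, 300))    # 258
print(estimate_image_tokens(1024, 1024))  # 4 tiles -> 1032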
\ No newline at end of file diff --git a/docstore/c1549bcc-285e-46bd-b5c1-1c62dec1d12d b/docstore/c1549bcc-285e-46bd-b5c1-1c62dec1d12d new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/c1549bcc-285e-46bd-b5c1-1c62dec1d12d @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/c1559155-54c9-4ec2-adfc-3834905b1534 b/docstore/c1559155-54c9-4ec2-adfc-3834905b1534 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/c1559155-54c9-4ec2-adfc-3834905b1534 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/c159917b-ba9d-42da-b811-03b8ec0cbcb1 b/docstore/c159917b-ba9d-42da-b811-03b8ec0cbcb1 new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/c159917b-ba9d-42da-b811-03b8ec0cbcb1 @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . 
generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/c15d5770-b9d1-4de2-bd56-46dc3d85d943 b/docstore/c15d5770-b9d1-4de2-bd56-46dc3d85d943 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/c15d5770-b9d1-4de2-bd56-46dc3d85d943 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/c17ca110-c410-424a-afab-a400f6a18714 b/docstore/c17ca110-c410-424a-afab-a400f6a18714 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/c17ca110-c410-424a-afab-a400f6a18714 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/c1812e75-8670-4643-b5ad-97780734b863 b/docstore/c1812e75-8670-4643-b5ad-97780734b863 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/c1812e75-8670-4643-b5ad-97780734b863 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . 
TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/c1943673-6e7d-4f4f-9c35-c0542901140f b/docstore/c1943673-6e7d-4f4f-9c35-c0542901140f new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/c1943673-6e7d-4f4f-9c35-c0542901140f @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/c1960fd9-fc2b-4832-bf46-6b5af1ed30ab b/docstore/c1960fd9-fc2b-4832-bf46-6b5af1ed30ab new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/c1960fd9-fc2b-4832-bf46-6b5af1ed30ab @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
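Ahead of the model details, a minimal single-speaker sketch of calling this TTS model in Python is shown below; the SpeechConfig/VoiceConfig types, the voice name, and the 24 kHz 16-bit mono PCM output format are assumptions taken from the speech generation guide rather than from this page.

Python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Aoede")
            )
        ),
    ),
)

# The audio comes back as raw PCM; 24 kHz, 16-bit mono is assumed here.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)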
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/c1ae5205-30dd-49f7-b321-b0004213e384 b/docstore/c1ae5205-30dd-49f7-b321-b0004213e384 new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/c1ae5205-30dd-49f7-b321-b0004213e384 @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . 
Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/c1c83373-0467-4a10-82db-06100f3dff5e b/docstore/c1c83373-0467-4a10-82db-06100f3dff5e new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/c1c83373-0467-4a10-82db-06100f3dff5e @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/c1caabbf-a355-4306-a817-fd3bf139bb07 b/docstore/c1caabbf-a355-4306-a817-fd3bf139bb07 new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/c1caabbf-a355-4306-a817-fd3bf139bb07 @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. 
Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/c1effeeb-57ad-4ae7-8827-a8d81e102d39 b/docstore/c1effeeb-57ad-4ae7-8827-a8d81e102d39 new file mode 100644 index 0000000000000000000000000000000000000000..f019890956a490f4b806038c108c4d7d0e98e7a2 --- /dev/null +++ b/docstore/c1effeeb-57ad-4ae7-8827-a8d81e102d39 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#languages Title: Speech generation (text-to-speech) | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c243129f-fa63-402c-8f09-e98d412fd6c0 b/docstore/c243129f-fa63-402c-8f09-e98d412fd6c0 new file mode 100644 index 0000000000000000000000000000000000000000..2576a967489affc6bfdb929d64cf063c798b38f9 --- /dev/null +++ b/docstore/c243129f-fa63-402c-8f09-e98d412fd6c0 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c277f1ca-c0cc-46e7-9de5-8725b7662041 b/docstore/c277f1ca-c0cc-46e7-9de5-8725b7662041 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/c277f1ca-c0cc-46e7-9de5-8725b7662041 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/c27870a8-b6ce-41ed-95f5-0ab31bd08ff5 b/docstore/c27870a8-b6ce-41ed-95f5-0ab31bd08ff5 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/c27870a8-b6ce-41ed-95f5-0ab31bd08ff5 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. 
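To make the two-stage picture above concrete, here is a toy sketch of temperature-controlled sampling over the example distribution; the re-weighting math is illustrative only and is not a description of the production decoder.

Python
import math
import random

# Toy next-token distribution from the example above.
next_token_probs = {"fence": 0.77, "ledge": 0.12, "blanket": 0.03}

def sample_with_temperature(probs: dict, temperature: float) -> str:
    if temperature == 0:
        # Greedy decoding: always take the most likely token.
        return max(probs, key=probs.get)
    # Re-weight the distribution: low temperature sharpens it,
    # high temperature flattens it toward uniform.
    weights = {tok: math.exp(math.log(p) / temperature) for tok, p in probs.items()}
    total = sum(weights.values())
    r = random.uniform(0, total)
    cumulative = 0.0
    for tok, w in weights.items():
        cumulative += w
        if r <= cumulative:
            return tok
    return tok  # fallback for floating-point edge cases

print(sample_with_temperature(next_token_probs, 0))    # always "fence"
print(sample_with_temperature(next_token_probs, 1.0))  # usually "fence", sometimes others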
A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/c27bee2d-78d7-44b3-a6a9-868db8836ec1 b/docstore/c27bee2d-78d7-44b3-a6a9-868db8836ec1 new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/c27bee2d-78d7-44b3-a6a9-868db8836ec1 @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/c27c6800-155f-4313-9e02-a3a8ff239179 b/docstore/c27c6800-155f-4313-9e02-a3a8ff239179 new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/c27c6800-155f-4313-9e02-a3a8ff239179 @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. 
Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/c29ba94b-4bf9-4990-a320-08206cf79b68 b/docstore/c29ba94b-4bf9-4990-a320-08206cf79b68 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/c29ba94b-4bf9-4990-a320-08206cf79b68 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . 
list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/c2a63fd4-3400-48aa-9a6f-abbc2d615963 b/docstore/c2a63fd4-3400-48aa-9a6f-abbc2d615963 new file mode 100644 index 0000000000000000000000000000000000000000..42ae2796f9fa6a593373179df1e744bd9e72e4bd --- /dev/null +++ b/docstore/c2a63fd4-3400-48aa-9a6f-abbc2d615963 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding#inline-video Title: Video understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c2b20c46-2789-4120-a96a-184b44e50f38 b/docstore/c2b20c46-2789-4120-a96a-184b44e50f38 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/c2b20c46-2789-4120-a96a-184b44e50f38 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/c2b596bb-5c37-46b7-ab2d-24420af0c86d b/docstore/c2b596bb-5c37-46b7-ab2d-24420af0c86d new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/c2b596bb-5c37-46b7-ab2d-24420af0c86d @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct - now to make the response even more streamlined, you can ask the model to to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. 
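As a minimal sketch of this approach (assuming the google-genai Python SDK; the prompt and model name are illustrative only), the format request is simply part of the prompt text:

Python
from google import genai

client = genai.Client()

# Ask for a specific output format directly in the prompt.
prompt = (
    "List three common causes of slow Wi-Fi. "
    "Return the answer as a markdown table with columns Cause and Fix."
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
)
print(response.text)  # expected to contain a markdown table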
Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/c2b5a1c9-24b5-4aec-801a-a28ba301d189 b/docstore/c2b5a1c9-24b5-4aec-801a-a28ba301d189 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/c2b5a1c9-24b5-4aec-801a-a28ba301d189 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/c2cecef2-8e2a-49e1-9931-07a8b5968756 b/docstore/c2cecef2-8e2a-49e1-9931-07a8b5968756 new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/c2cecef2-8e2a-49e1-9931-07a8b5968756 @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . 
ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/c2fa302a-7e65-401f-a0a6-604df9376fcb b/docstore/c2fa302a-7e65-401f-a0a6-604df9376fcb new file mode 100644 index 0000000000000000000000000000000000000000..b0571e28c8e74f7e3e23139b08c0865b24edbd38 --- /dev/null +++ b/docstore/c2fa302a-7e65-401f-a0a6-604df9376fcb @@ -0,0 +1 @@ +And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. 
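For instance, reusing the recipe idea from above, a hedged sketch with a Pydantic model (assuming the google-genai Python SDK; response.parsed may be None if the model's output doesn't validate against the schema) looks like this:

Python
from google import genai
from pydantic import BaseModel

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)

print(response.text)                      # raw JSON string
recipes = response.parsed                 # Pydantic instances, if parsing succeeded
for r in recipes or []:
    print(r.recipe_name)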
To learn more, see JSON schemas in Python . Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": \ No newline at end of file diff --git a/docstore/c30ada10-2bcc-4aa0-b5f2-3d036fbc822c b/docstore/c30ada10-2bcc-4aa0-b5f2-3d036fbc822c new file mode 100644 index 0000000000000000000000000000000000000000..5b23b75839f7d9f5e86c0814ceb13216aba4c820 --- /dev/null +++ b/docstore/c30ada10-2bcc-4aa0-b5f2-3d036fbc822c @@ -0,0 +1 @@ +Using Gemini API keys | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Using Gemini API keys To use the Gemini API, you need an API key. You can create a key for free with a few clicks in Google AI Studio . Once you have an API key, you have the following options to connect to the Gemini API: Setting your API key as an environment variable Providing your API key explicitly For initial testing, you can hard code an API key, but this should only be temporary since it's not secure. You can find examples for hard coding the API key in Providing API key explicitly section. Setting API key as environment variable If you set the environment variable GEMINI_API_KEY or GOOGLE_API_KEY , the API key will automatically be picked up by the client when using one of the Gemini API libraries . It's recommended that you set only one of those variables, but if both are set, GOOGLE_API_KEY takes precedence. If you're using the REST API, or JavaScript on the browser, you will need to provide the API key explicitly. Here is how you can set your API key locally as the environment variable GEMINI_API_KEY with different operating systems. Linux/macOS - Bash Bash is a common Linux and macOS terminal configuration. You can check if you have a configuration file for it by running the following command: ~/.bashrc If the response is "No such file or directory", you will need to create this file and open it by running the following commands, or use zsh : touch ~/.bashrc open ~/.bashrc Next, you need to set your API key by adding the following export command: export GEMINI_API_KEY = After saving the file, apply the changes by running: source ~/.bashrc macOS \ No newline at end of file diff --git a/docstore/c31b81db-f148-4e09-9ac2-5c5f9b865b4c b/docstore/c31b81db-f148-4e09-9ac2-5c5f9b865b4c new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/c31b81db-f148-4e09-9ac2-5c5f9b865b4c @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . 
STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/c3335216-9d38-4e36-a61f-76d17abb1595 b/docstore/c3335216-9d38-4e36-a61f-76d17abb1595 new file mode 100644 index 0000000000000000000000000000000000000000..06648438cf900fddf03ee010d162bb03653f66d3 --- /dev/null +++ b/docstore/c3335216-9d38-4e36-a61f-76d17abb1595 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c33c8f23-cf5e-470b-ae30-9aeca2f46586 b/docstore/c33c8f23-cf5e-470b-ae30-9aeca2f46586 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/c33c8f23-cf5e-470b-ae30-9aeca2f46586 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
\ No newline at end of file diff --git a/docstore/c38925ef-738b-481b-8d07-e1be342d97d4 b/docstore/c38925ef-738b-481b-8d07-e1be342d97d4 new file mode 100644 index 0000000000000000000000000000000000000000..63d9f8f1c5d6f296f9334ea760bebfcc6dc4a24c --- /dev/null +++ b/docstore/c38925ef-738b-481b-8d07-e1be342d97d4 @@ -0,0 +1 @@ +Gemini thinking | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" \ No newline at end of file diff --git a/docstore/c390b569-4c5e-400b-98d7-39bc2ed659b1 b/docstore/c390b569-4c5e-400b-98d7-39bc2ed659b1 new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/c390b569-4c5e-400b-98d7-39bc2ed659b1 @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! 
If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/c3944e51-0aef-4587-897b-4f4ca6331406 b/docstore/c3944e51-0aef-4587-897b-4f4ca6331406 new file mode 100644 index 0000000000000000000000000000000000000000..13dae0738dbe1cc658da4f8d80ffe7f33c50362c --- /dev/null +++ b/docstore/c3944e51-0aef-4587-897b-4f4ca6331406 @@ -0,0 +1 @@ +(Multimodal Live API) Models supported All Gemini 2.0 and 2.5 models Only Flash experimental models File input types supported .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts .png, .jpeg, .csv, .xml, .cpp, .java, .py, .js, .ts Plotting libraries supported Matplotlib, seaborn Matplotlib, seaborn Multi-tool use Yes (code execution + grounding only) Yes Billing There's no additional charge for enabling code execution from the Gemini API. You'll be billed at the current rate of input and output tokens based on the Gemini model you're using. Here are a few other things to know about billing for code execution: You're only billed once for the input tokens you pass to the model, and you're billed for the final output tokens returned to you by the model. Tokens representing generated code are counted as output tokens. Generated code can include text and multimodal output like images. Code execution results are also counted as output tokens. The billing model is shown in the following diagram: You're billed at the current rate of input and output tokens based on the Gemini model you're using. If Gemini uses code execution when generating your response, the original prompt, the generated code, and the result of the executed code are labeled intermediate tokens and are billed as input tokens . Gemini then generates a summary and returns the generated code, the result of the executed code, and the final summary. These are billed as output tokens . The Gemini API includes an intermediate token count in the API response, so you know why you're getting additional input tokens beyond your initial prompt. Limitations The model can only generate and execute code. It can't return other artifacts like media files. In some cases, enabling code execution can lead to regressions in other areas of model output (for example, writing a story). There is some variation in the ability of the different models to use code execution successfully. 
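To make the billing discussion above concrete, here is a minimal sketch (assuming the google-genai Python SDK; the prompt is illustrative) that enables the code execution tool and inspects both the interleaved response parts and the usage metadata where the token counts appear:

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=(
        "What is the sum of the first 50 prime numbers? "
        "Generate and run code for the calculation."
    ),
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())],
    ),
)

# The response interleaves text, generated code, and execution results.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    if part.executable_code is not None:
        print(part.executable_code.code)
    if part.code_execution_result is not None:
        print(part.code_execution_result.output)

print(response.usage_metadata)  # prompt, output, and any intermediate token counts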
Supported libraries The code execution \ No newline at end of file diff --git a/docstore/c3997f5c-78a0-48f1-83d5-6de17188fbef b/docstore/c3997f5c-78a0-48f1-83d5-6de17188fbef new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/c3997f5c-78a0-48f1-83d5-6de17188fbef @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/c3d7a1e2-76c5-422a-8452-85e090d02174 b/docstore/c3d7a1e2-76c5-422a-8452-85e090d02174 new file mode 100644 index 0000000000000000000000000000000000000000..64b38d40afbaa776eeced04508049a0f469e337d --- /dev/null +++ b/docstore/c3d7a1e2-76c5-422a-8452-85e090d02174 @@ -0,0 +1 @@ +OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . 
log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" } ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . b64encode ( \ No newline at end of file diff --git a/docstore/c3e697f0-b909-42a8-86c0-76edb561b2ec b/docstore/c3e697f0-b909-42a8-86c0-76edb561b2ec new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/c3e697f0-b909-42a8-86c0-76edb561b2ec @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/c3e76c86-089a-4217-ade8-d8ef150fad75 b/docstore/c3e76c86-089a-4217-ade8-d8ef150fad75 new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/c3e76c86-089a-4217-ade8-d8ef150fad75 @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/c3f56d86-bc2a-47aa-baf9-590ada3b8a90 b/docstore/c3f56d86-bc2a-47aa-baf9-590ada3b8a90 new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/c3f56d86-bc2a-47aa-baf9-590ada3b8a90 @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. 
The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/c3f7951b-0e54-4fcc-8db6-ebbbad73972e b/docstore/c3f7951b-0e54-4fcc-8db6-ebbbad73972e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/c3f7951b-0e54-4fcc-8db6-ebbbad73972e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/c3fe3133-06c8-434c-bcd7-11400ef80871 b/docstore/c3fe3133-06c8-434c-bcd7-11400ef80871 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/c3fe3133-06c8-434c-bcd7-11400ef80871 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. 
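A Python counterpart to the JavaScript and Go snippets above, as a hedged sketch (assuming the google-genai Python SDK; field names can differ between SDK versions):

Python
from google import genai

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-pro",
    contents="Explain the concept of Occam's Razor and provide a simple, everyday example.",
)

# Thought tokens are billed even though only a summary of the thinking is returned.
print("Thoughts tokens:", response.usage_metadata.thoughts_token_count)
print("Output tokens:", response.usage_metadata.candidates_token_count)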
Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/c41d9e90-31b2-4515-966d-614db1ad00e4 b/docstore/c41d9e90-31b2-4515-966d-614db1ad00e4 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/c41d9e90-31b2-4515-966d-614db1ad00e4 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/c4212c8d-08bc-49bd-b859-792ca5db9ae8 b/docstore/c4212c8d-08bc-49bd-b859-792ca5db9ae8 new file mode 100644 index 0000000000000000000000000000000000000000..cd1a6469d32d4344455628e24b4f24d47cbf3ee6 --- /dev/null +++ b/docstore/c4212c8d-08bc-49bd-b859-792ca5db9ae8 @@ -0,0 +1 @@ +new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "List a few popular cookie recipes, and include the amounts of ingredients." , config : { responseMimeType : "application/json" , responseSchema : { type : Type . ARRAY , items : { type : Type . OBJECT , properties : { recipeName : { type : Type . STRING , }, ingredients : { type : Type . ARRAY , items : { type : Type . STRING , }, }, }, propertyOrdering : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { ResponseMIMEType : "application/json" , ResponseSchema : & genai . Schema { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeObject , Properties : map [ string ] * genai . Schema { "recipeName" : { Type : genai . TypeString }, "ingredients" : { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeString }, }, }, PropertyOrdering : [] string { "recipeName" , "ingredients" }, }, }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "List a few popular cookie recipes, and include the amounts of ingredients." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "List a few popular cookie recipes, and include the amounts of ingredients." } ] }], "generationConfig": { "responseMimeType": "application/json", "responseSchema": { "type": "ARRAY", "items": { "type": "OBJECT", "properties": { "recipeName": { \ No newline at end of file diff --git a/docstore/c42a5013-e3ea-4077-aa02-df0f03e7fbc6 b/docstore/c42a5013-e3ea-4077-aa02-df0f03e7fbc6 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/c42a5013-e3ea-4077-aa02-df0f03e7fbc6 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . 
Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/c43a1fdd-c5d5-48e5-8018-b5aee3f528ba b/docstore/c43a1fdd-c5d5-48e5-8018-b5aee3f528ba new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/c43a1fdd-c5d5-48e5-8018-b5aee3f528ba @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! 
""" tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/c44ea26a-9ac8-48ac-b27c-8f30e811f048 b/docstore/c44ea26a-9ac8-48ac-b27c-8f30e811f048 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/c44ea26a-9ac8-48ac-b27c-8f30e811f048 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/c46c4bea-8b96-46e8-bbc9-f560f71d3179 b/docstore/c46c4bea-8b96-46e8-bbc9-f560f71d3179 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/c46c4bea-8b96-46e8-bbc9-f560f71d3179 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. 
Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/c49479c6-da9f-4ee5-9c15-dba69be5186a b/docstore/c49479c6-da9f-4ee5-9c15-dba69be5186a new file mode 100644 index 0000000000000000000000000000000000000000..0cc2840ba15806724fb46627efd1d77acb27b134 --- /dev/null +++ b/docstore/c49479c6-da9f-4ee5-9c15-dba69be5186a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live#response-modalities Title: Get started with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c495d63e-876e-4b3c-9e2d-10a0985b9863 b/docstore/c495d63e-876e-4b3c-9e2d-10a0985b9863 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/c495d63e-876e-4b3c-9e2d-10a0985b9863 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/c498ac8e-f5ab-4996-b68a-633676ffbec5 b/docstore/c498ac8e-f5ab-4996-b68a-633676ffbec5 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/c498ac8e-f5ab-4996-b68a-633676ffbec5 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/c4cddd1c-2064-46e3-91e1-09f53ee1d4fa b/docstore/c4cddd1c-2064-46e3-91e1-09f53ee1d4fa new file mode 100644 index 0000000000000000000000000000000000000000..2dce4b1915975420243f156ab22de6a07e8b5cc9 --- /dev/null +++ b/docstore/c4cddd1c-2064-46e3-91e1-09f53ee1d4fa @@ -0,0 +1 @@ +The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. 
System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/c4d6c2e0-0e2e-4b34-9ca3-e997e62b7055 b/docstore/c4d6c2e0-0e2e-4b34-9ca3-e997e62b7055 new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/c4d6c2e0-0e2e-4b34-9ca3-e997e62b7055 @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/c4d7869e-6a41-4dc3-9d86-224b73df2fcd b/docstore/c4d7869e-6a41-4dc3-9d86-224b73df2fcd new file mode 100644 index 0000000000000000000000000000000000000000..91fd6dbcb4d807434a341e23a941ef0850298bc5 --- /dev/null +++ b/docstore/c4d7869e-6a41-4dc3-9d86-224b73df2fcd @@ -0,0 +1 @@ +transcript' , config = types . GenerateContentConfig ( cached_content = apollo_cache . 
name , ) ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const filePath = path . join ( media , "a11.txt" ); const document = await ai . files . upload ({ file : filePath , config : { mimeType : "text/plain" }, }); console . log ( "Uploaded file name:" , document . name ); const modelName = "gemini-1.5-flash" ; const contents = [ createUserContent ( createPartFromUri ( document . uri , document . mimeType )), ]; const cache = await ai . caches . create ({ model : modelName , config : { contents : contents , systemInstruction : "You are an expert analyzing transcripts." , }, }); console . log ( "Cache created:" , cache ); const response = await ai . models . generateContent ({ model : modelName , contents : "Please summarize this transcript" , config : { cachedContent : cache . name }, }); console . log ( "Response text:" , response . text ); Count tokens Count the number of tokens in a request. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . count_tokens ( 'The quick brown fox jumps over the lazy dog.' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI.getGenerativeModel({ model: "gemini-1.5-flash", }); // Count tokens in a prompt without calling text generation. const countResult = await model.countTokens( "The quick brown fox jumps over the lazy dog.", ); console.log(countResult.totalTokens); // 11 const generateResult = await model.generateContent( "The quick brown fox jumps over the lazy dog." , ); // On the response for `generateContent`, use `usageMetadata` // to get separate input and output token counts // (`promptTokenCount` and `candidatesTokenCount`, respectively), // as well as the combined token count \ No newline at end of file diff --git a/docstore/c4dc6fe5-970d-4adc-a60d-614912341f00 b/docstore/c4dc6fe5-970d-4adc-a60d-614912341f00 new file mode 100644 index 0000000000000000000000000000000000000000..cba7e39aa9222f7493d03657c4d4a4fea7949da3 --- /dev/null +++ b/docstore/c4dc6fe5-970d-4adc-a60d-614912341f00 @@ -0,0 +1,3 @@ +URL: https://ai.google.dev/gemini-api/docs/tokens#main-content Title: Understand and count tokens | Gemini API | Google AI for Developers ================================================== + +Understand and count tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Understand and count tokens Python JavaScript Go Gemini and other generative AI models process input and output at a granularity called a token . About tokens Tokens can be single characters like z or whole words like cat . Long words are broken up into several tokens. The set of all tokens used by the model is called the vocabulary, and the process of splitting text into tokens is called tokenization . For Gemini models, a token is equivalent to about 4 characters. 100 tokens is equal to about 60-80 English words. When billing is enabled, the cost of a call to the Gemini API is determined in part by the number of input and output tokens, so knowing how to count tokens can be helpful. 
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/c4ecc95e-8baa-4a2f-8729-f9b5b9e274b3 b/docstore/c4ecc95e-8baa-4a2f-8729-f9b5b9e274b3 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/c4ecc95e-8baa-4a2f-8729-f9b5b9e274b3 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/c5193215-bcf9-4485-8492-dcf7c727388e b/docstore/c5193215-bcf9-4485-8492-dcf7c727388e new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/c5193215-bcf9-4485-8492-dcf7c727388e @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/c53b5de1-7f19-4615-8eb5-240bc09bef47 b/docstore/c53b5de1-7f19-4615-8eb5-240bc09bef47 new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/c53b5de1-7f19-4615-8eb5-240bc09bef47 @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. 
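For readers who want to see that function-response round trip in SDK form, here is a minimal sketch assuming the google-genai Python SDK; previous_user_content, previous_model_content, and weather_tool are hypothetical placeholders for the earlier turns and the getWeather declaration, and the role and payload shape simply mirror the REST request above.
Python
from google import genai
from google.genai import types

client = genai.Client()

# Our application executed getWeather itself; send the result back so the model
# can write the final, user-facing answer.
function_response_part = types.Part.from_function_response(
    name="getWeather",
    response={"response": "Sunny and hot. 90 degrees Fahrenheit"},
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        previous_user_content,   # hypothetical: the original user prompt
        previous_model_content,  # hypothetical: the model turn with the function call and thought signature
        types.Content(role="user", parts=[function_response_part]),
    ],
    config=types.GenerateContentConfig(tools=[weather_tool]),  # hypothetical tool declaration
)
print(response.text)
Returning the unmodified model turn in contents is what carries the thought signature forward between turns.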
Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/c56fb6bb-f348-490c-a9c4-62cd9fa7183d b/docstore/c56fb6bb-f348-490c-a9c4-62cd9fa7183d new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/c56fb6bb-f348-490c-a9c4-62cd9fa7183d @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/c5a8f292-3967-41ce-93c7-ebee7336d184 b/docstore/c5a8f292-3967-41ce-93c7-ebee7336d184 new file mode 100644 index 0000000000000000000000000000000000000000..086087b92745c4e9b643012a145762582d8d8dd5 --- /dev/null +++ b/docstore/c5a8f292-3967-41ce-93c7-ebee7336d184 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "What' \' 's the temperature in London?" 
} ] } ], "tools": [ { "functionDeclarations": [ { "name": "get_current_temperature", "description": "Gets the current temperature for a given location.", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city name, e.g. San Francisco" } }, "required": ["location"] } } ] } ] }' How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and \ No newline at end of file diff --git a/docstore/c5b096e1-078b-49d7-95da-bbf29f2b1557 b/docstore/c5b096e1-078b-49d7-95da-bbf29f2b1557 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/c5b096e1-078b-49d7-95da-bbf29f2b1557 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/c5b222f3-4244-429a-81df-d79d6475b935 b/docstore/c5b222f3-4244-429a-81df-d79d6475b935 new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/c5b222f3-4244-429a-81df-d79d6475b935 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . 
send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/c5c4d263-6bf8-4f3c-9d25-04582db46d1d b/docstore/c5c4d263-6bf8-4f3c-9d25-04582db46d1d new file mode 100644 index 0000000000000000000000000000000000000000..a5a6b39c097d8cbbd04646d1c0a8361a10d2c9ae --- /dev/null +++ b/docstore/c5c4d263-6bf8-4f3c-9d25-04582db46d1d @@ -0,0 +1 @@ +meanings as statements, which means that a RAG system won't automatically recognize their relation. Task types enable you to generate optimized embeddings for specific tasks, saving you time and cost and improving performance. Python from google import genai from google.genai import types client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" , config = types . EmbedContentConfig ( task_type = "SEMANTIC_SIMILARITY" ) ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , config : { taskType : "SEMANTIC_SIMILARITY" , } }); console . log ( response . embeddings ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]}, "taskType": "SEMANTIC_SIMILARITY" }' Supported task types Task type Description SEMANTIC_SIMILARITY Used to generate embeddings that are optimized to assess text similarity. CLASSIFICATION Used to generate embeddings that are optimized to classify texts according to preset labels. CLUSTERING Used to generate embeddings that are optimized to cluster texts based on their similarities. RETRIEVAL_DOCUMENT , RETRIEVAL_QUERY , QUESTION_ANSWERING , and FACT_VERIFICATION Used to generate embeddings that are optimized for document search or information retrieval. CODE_RETRIEVAL_QUERY Used to retrieve a code block based on a natural language query, such as sort an array or reverse a linked list. Embeddings of the code blocks are computed using RETRIEVAL_DOCUMENT . Use cases Text embeddings \ No newline at end of file diff --git a/docstore/c5e16ca3-eccc-4522-818e-2e767db38a76 b/docstore/c5e16ca3-eccc-4522-818e-2e767db38a76 new file mode 100644 index 0000000000000000000000000000000000000000..a3a5b12622da56afcf66e8d09d0c1c7555dea0d4 --- /dev/null +++ b/docstore/c5e16ca3-eccc-4522-818e-2e767db38a76 @@ -0,0 +1 @@ +"gemini-live-2.5-flash-preview" tools = [{ 'google_search' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "When did the last Brazil vs. Argentina soccer match happen?" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) # The model might generate and execute Python code to use Search model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . 
output ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ googleSearch : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/c5e25e39-f12e-45c6-8561-04e8cc628a25 b/docstore/c5e25e39-f12e-45c6-8561-04e8cc628a25 new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/c5e25e39-f12e-45c6-8561-04e8cc628a25 @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
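As a companion to the audio-format notes above, the following is a minimal hedged sketch of sending raw 16 kHz PCM audio into a Live API session with the google-genai Python SDK; the send_realtime_input call and the audio.pcm file name are illustrative assumptions rather than samples from this page.
Python
import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # "audio.pcm" is a placeholder for raw little-endian 16-bit PCM captured at 16 kHz.
        with open("audio.pcm", "rb") as f:
            pcm_bytes = f.read()

        # The MIME type carries the input sample rate, as described above.
        # send_realtime_input is assumed here; check the Live API guide for the
        # exact method on your SDK version.
        await session.send_realtime_input(
            audio=types.Blob(data=pcm_bytes, mime_type="audio/pcm;rate=16000")
        )

        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

asyncio.run(main())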
\ No newline at end of file diff --git a/docstore/c5e783fd-b3bf-406f-9fff-f7f9364d00c4 b/docstore/c5e783fd-b3bf-406f-9fff-f7f9364d00c4 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/c5e783fd-b3bf-406f-9fff-f7f9364d00c4 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/c5ee7588-8031-40eb-927e-b942889bccf6 b/docstore/c5ee7588-8031-40eb-927e-b942889bccf6 new file mode 100644 index 0000000000000000000000000000000000000000..dc35916dc3478088071aa0ef0f9dbb66dbfa3387 --- /dev/null +++ b/docstore/c5ee7588-8031-40eb-927e-b942889bccf6 @@ -0,0 +1 @@ +response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfURL := "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" localPdfPath := "A17_FlightPlan_downloaded.pdf" respHttp , _ := http . Get ( pdfURL ) defer respHttp . Body . Close () outFile , _ := os . Create ( localPdfPath ) defer outFile . Close () _ , _ = io . Copy ( outFile , respHttp . Body ) uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . 
\ No newline at end of file diff --git a/docstore/c5f036ef-a0c7-4c2a-abcc-1fe191a48bcf b/docstore/c5f036ef-a0c7-4c2a-abcc-1fe191a48bcf new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/c5f036ef-a0c7-4c2a-abcc-1fe191a48bcf @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/c5f4557c-8f06-48f4-a929-b7c9732499ae b/docstore/c5f4557c-8f06-48f4-a929-b7c9732499ae new file mode 100644 index 0000000000000000000000000000000000000000..b362fdd58ed7301c466f0b3a048e65a061fc1b90 --- /dev/null +++ b/docstore/c5f4557c-8f06-48f4-a929-b7c9732499ae @@ -0,0 +1 @@ +"messages": [ {"role": "user", "content": "Explain to me how AI works"} ] }' Gemini thinking models also produce thought summaries and can use exact thinking budgets . You can use the extra_body field to include these fields in your request. Note that reasoning_effort and thinking_budget overlap functionality, so they can't be used at the same time. Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.5-flash" , messages = [{ "role" : "user" , "content" : "Explain to me how AI works" }], extra_body = { 'extra_body' : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : True } } } } ) print ( response . choices [ 0 ] . 
message ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); const response = await openai . chat . completions . create ({ model : "gemini-2.5-flash" , messages : [{ role : "user" , content : "Explain to me how AI works" ,}], extra_body : { "google" : { "thinking_config" : { "thinking_budget" : 800 , "include_thoughts" : true } } } }); console . log ( response . choices [ 0 ]. message ); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.5-flash", "messages": [{"role": "user", "content": "Explain to me how AI works"}], "extra_body": { "google": { "thinking_config": { "include_thoughts": true } } } }' Streaming The Gemini API supports streaming responses . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { \ No newline at end of file diff --git a/docstore/c5fe8c18-7df7-4d5a-b13e-7581e112b4fd b/docstore/c5fe8c18-7df7-4d5a-b13e-7581e112b4fd new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/c5fe8c18-7df7-4d5a-b13e-7581e112b4fd @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/c5ff8c49-862c-43c4-bed2-edcccdc06f77 b/docstore/c5ff8c49-862c-43c4-bed2-edcccdc06f77 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/c5ff8c49-862c-43c4-bed2-edcccdc06f77 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/c638021b-6a7a-4f7f-905b-30feaf4ef111 b/docstore/c638021b-6a7a-4f7f-905b-30feaf4ef111 new file mode 100644 index 0000000000000000000000000000000000000000..bb8a5c387274cea53762666893b97a549fe37a5c --- /dev/null +++ b/docstore/c638021b-6a7a-4f7f-905b-30feaf4ef111 @@ -0,0 +1 @@ +Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { \ No newline at end of file diff --git a/docstore/c6396cb5-49c9-4b58-8044-99730b891bfd b/docstore/c6396cb5-49c9-4b58-8044-99730b891bfd new file mode 100644 index 0000000000000000000000000000000000000000..80d5712c34eded00f9773f9aab36a5ebeb2605a2 --- /dev/null +++ b/docstore/c6396cb5-49c9-4b58-8044-99730b891bfd @@ -0,0 +1 @@ +descriptive phrases which you've found perform safely in your application context. Blocking unsafe inputs and filtering output before it is shown to the user. In simple situations, blocklists can be used to identify and block unsafe words or phrases in prompts or responses, or require human reviewers to manually alter or block such content. Note: Automatically blocking based on a static list can have unintended results such as targeting a particular group that commonly uses vocabulary in the blocklist. Using trained classifiers to label each prompt with potential harms or adversarial signals. Different strategies can then be employed on how to handle the request based on the type of harm detected. For example, If the input is overtly adversarial or abusive in nature, it could be blocked and instead output a pre-scripted response. Advanced tip If signals determine the output to be harmful, the application can employ the following options: Provide an error message or pre-scripted output. Try the prompt again, in case an alternative safe output is generated, since sometimes the same prompt will elicit different outputs. Putting safeguards in place against deliberate misuse such as assigning each user a unique ID and imposing a limit on the volume of user queries that can be submitted in a given period. Another safeguard is to try and protect against possible prompt injection. Prompt injection, much like SQL injection, is a way for malicious users to design an input prompt that manipulates the output of the model, for example, by sending an input prompt that instructs the model to ignore any previous examples. See the Generative AI Prohibited Use Policy for details about deliberate misuse. Adjusting functionality to something that is inherently lower risk. 
Tasks that are narrower in scope (e.g., extracting keywords from passages of text) or that have greater human oversight (e.g., generating short-form content that will be reviewed by a human), often pose a lower risk. \ No newline at end of file diff --git a/docstore/c65b2cc6-0606-4200-b8bc-48df0384b4de b/docstore/c65b2cc6-0606-4200-b8bc-48df0384b4de new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/c65b2cc6-0606-4200-b8bc-48df0384b4de @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/c680af3b-8334-428c-84d6-e1d234a8154a b/docstore/c680af3b-8334-428c-84d6-e1d234a8154a new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/c680af3b-8334-428c-84d6-e1d234a8154a @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . 
initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/c68c3dd0-69c6-456a-a5b3-102aac68f1a2 b/docstore/c68c3dd0-69c6-456a-a5b3-102aac68f1a2 new file mode 100644 index 0000000000000000000000000000000000000000..954b7c2f84cbdb4650dd5497451af99dc34fb5db --- /dev/null +++ b/docstore/c68c3dd0-69c6-456a-a5b3-102aac68f1a2 @@ -0,0 +1 @@ +GoogleGenAI ({}); let response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." , ], config : { tools : [{ codeExecution : {} }], }, }); const parts = response ? . candidates ? .[ 0 ] ? . content ? . parts || []; parts . forEach (( part ) = > { if ( part . text ) { console . log ( part . text ); } if ( part . executableCode && part . executableCode . code ) { console . log ( part . executableCode . code ); } if ( part . codeExecutionResult && part . codeExecutionResult . output ) { console . log ( part . codeExecutionResult . output ); } }); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." ), config , ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d ' {"tools": [{"code_execution": {}}], "contents": { "parts": { "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." } }, }' Note: This REST example doesn't parse the JSON response as shown in the example output. 
The output might look something like the following, which has been formatted for readability: Okay, I need to calculate \ No newline at end of file diff --git a/docstore/c68c6485-e7e1-4ae6-871c-b2465a6e7574 b/docstore/c68c6485-e7e1-4ae6-871c-b2465a6e7574 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/c68c6485-e7e1-4ae6-871c-b2465a6e7574 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/c69aefd8-56e5-4ee5-b823-1d19855819c8 b/docstore/c69aefd8-56e5-4ee5-b823-1d19855819c8 new file mode 100644 index 0000000000000000000000000000000000000000..a07288c87291962aa181765ea596a2b1afe9ed3f --- /dev/null +++ b/docstore/c69aefd8-56e5-4ee5-b823-1d19855819c8 @@ -0,0 +1 @@ +Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. 
Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token \ No newline at end of file diff --git a/docstore/c6bc9d32-9593-4c2b-ba5c-c9f20c8be9fb b/docstore/c6bc9d32-9593-4c2b-ba5c-c9f20c8be9fb new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/c6bc9d32-9593-4c2b-ba5c-c9f20c8be9fb @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
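Tying together the function-calling best practices listed earlier (clear descriptions, descriptive names, strong typing with an enum, a small tool set, and a low temperature), here is a minimal sketch that uses a hypothetical schedule_meeting function and assumes the google-genai Python SDK:

Python
# Minimal sketch of the best practices above; schedule_meeting is a
# hypothetical declaration, not part of the official samples.
from google import genai
from google.genai import types

schedule_meeting = {
    "name": "schedule_meeting",  # descriptive name, no spaces or dashes
    "description": "Schedules a meeting on the user's calendar.",
    "parameters": {
        "type": "object",
        "properties": {
            "topic": {"type": "string", "description": "Short meeting title."},
            "duration_minutes": {"type": "integer", "description": "Length in minutes, 15-120."},
            "visibility": {
                "type": "string",
                "enum": ["private", "public"],  # enum for a limited value set
                "description": "Calendar visibility of the event.",
            },
        },
        "required": ["topic", "duration_minutes"],
    },
}

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Book a private 30 minute sync about the Q3 roadmap.",
    config=types.GenerateContentConfig(
        temperature=0,  # low temperature for more deterministic function calls
        tools=[types.Tool(function_declarations=[schedule_meeting])],  # only relevant tools
    ),
)

# If the model chose to call the function, the suggestion appears here
# (may be None if it answered in plain text instead).
print(response.candidates[0].content.parts[0].function_call)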
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/c6c1ed2f-3d3e-411a-93fc-2cb3134fe1f7 b/docstore/c6c1ed2f-3d3e-411a-93fc-2cb3134fe1f7 new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/c6c1ed2f-3d3e-411a-93fc-2cb3134fe1f7 @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." 
Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/c6c54543-3870-4ac6-8b51-3f5b79e88bd3 b/docstore/c6c54543-3870-4ac6-8b51-3f5b79e88bd3 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/c6c54543-3870-4ac6-8b51-3f5b79e88bd3 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/c6c5bccd-4fe6-4ca3-af7b-3de6444237e9 b/docstore/c6c5bccd-4fe6-4ca3-af7b-3de6444237e9 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/c6c5bccd-4fe6-4ca3-af7b-3de6444237e9 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . 
Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! " $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/c6cc6629-0210-4566-b093-db1192a108cc b/docstore/c6cc6629-0210-4566-b093-db1192a108cc new file mode 100644 index 0000000000000000000000000000000000000000..4433c8232d2fd05b9dc895143c041f280f0af769 --- /dev/null +++ b/docstore/c6cc6629-0210-4566-b093-db1192a108cc @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-tools Title: Tool use with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c6d1527e-39b3-49f4-b095-eca4e7242eee b/docstore/c6d1527e-39b3-49f4-b095-eca4e7242eee new file mode 100644 index 0000000000000000000000000000000000000000..989a38b805ed3662f352ccf72b45824dd12e3417 --- /dev/null +++ b/docstore/c6d1527e-39b3-49f4-b095-eca4e7242eee @@ -0,0 +1 @@ +Tool use with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Tool use with Live API Tool use allows Live API to go beyond just conversation by enabling it to perform actions in the real-world and pull in external context while maintaining a real time connection. You can define tools such as Function calling , Code execution , and Google Search with the Live API. Overview of supported tools Here's a brief overview of the available tools for each model: Tool Cascaded models gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Search Yes Yes Yes Function calling Yes Yes No Code execution Yes No No Url context Yes No No Function calling Live API supports function calling, just like regular content generation requests. Function calling lets the Live API interact with external data and programs, greatly increasing what your applications can accomplish. You can define function declarations as part of the session configuration. After receiving tool calls, the client should respond with a list of FunctionResponse objects using the session.send_tool_response method. 
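A client loop that answers those tool calls might look like the following minimal sketch (assumes the google-genai Python SDK; the stub result is a stand-in for your own function implementations):

Python
# Minimal sketch: receive Live API messages, answer tool calls manually
# with session.send_tool_response, and print any text the model returns.
from google.genai import types

async def handle_server_messages(session):
    async for message in session.receive():
        if message.tool_call:
            function_responses = []
            for fc in message.tool_call.function_calls:
                result = {"result": "ok"}  # replace with your own handler logic
                function_responses.append(
                    types.FunctionResponse(
                        id=fc.id,      # echo the id so the call can be matched
                        name=fc.name,
                        response=result,
                    )
                )
            await session.send_tool_response(function_responses=function_responses)
        elif message.text is not None:
            print(message.text)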
See the Function calling tutorial to learn more. Note: Unlike the generateContent API, the Live API doesn't support automatic tool response handling. You must handle tool responses manually in your client code. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" # Simple function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" } turn_off_the_lights = { "name" : \ No newline at end of file diff --git a/docstore/c6efe4ec-9b08-4821-a7f1-54b75e1ff60c b/docstore/c6efe4ec-9b08-4821-a7f1-54b75e1ff60c new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/c6efe4ec-9b08-4821-a7f1-54b75e1ff60c @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . 
GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/c708c279-31a6-44af-8457-da4c43506d15 b/docstore/c708c279-31a6-44af-8457-da4c43506d15 new file mode 100644 index 0000000000000000000000000000000000000000..11f27b5229ca0eb555eb4424519c1aeb19a4be6c --- /dev/null +++ b/docstore/c708c279-31a6-44af-8457-da4c43506d15 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/quickstart#main-content Title: Gemini API quickstart | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c74f8ff3-0788-4f81-92da-4d5376677d8c b/docstore/c74f8ff3-0788-4f81-92da-4d5376677d8c new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/c74f8ff3-0788-4f81-92da-4d5376677d8c @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/c75a309c-2cbc-4d03-997f-179bf0a1757c b/docstore/c75a309c-2cbc-4d03-997f-179bf0a1757c new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/c75a309c-2cbc-4d03-997f-179bf0a1757c @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? 
""" tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/c75d9555-6cb1-4a59-bbd5-70d9c34e0499 b/docstore/c75d9555-6cb1-4a59-bbd5-70d9c34e0499 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/c75d9555-6cb1-4a59-bbd5-70d9c34e0499 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/c787019f-da26-4ec7-8388-d95014786a87 b/docstore/c787019f-da26-4ec7-8388-d95014786a87 new file mode 100644 index 0000000000000000000000000000000000000000..97dbf55e65dff08953b6d96f3c29b5e9a98ff448 --- /dev/null +++ b/docstore/c787019f-da26-4ec7-8388-d95014786a87 @@ -0,0 +1,3 @@ +URL: https://ai.google.dev/gemini-api/docs/model-tuning#main-content Title: Fine-tuning with the Gemini API | Google AI for Developers ================================================== + +Fine-tuning with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Fine-tuning with the Gemini API With the deprecation of Gemini 1.5 Flash-001 in May 2025, we no longer have a model available which supports fine-tuning in the Gemini API, but it is supported in Vertex AI . We plan to bring fine-tuning support back in the future. We would love to hear from you on our developer forum if fine-tuning is important to your use case. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-30 UTC. \ No newline at end of file diff --git a/docstore/c791f196-a0f5-4970-9e86-5003cf9c6c4e b/docstore/c791f196-a0f5-4970-9e86-5003cf9c6c4e new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/c791f196-a0f5-4970-9e86-5003cf9c6c4e @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/c7af23ce-2274-48b6-8d76-956837ea6fc1 b/docstore/c7af23ce-2274-48b6-8d76-956837ea6fc1 new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/c7af23ce-2274-48b6-8d76-956837ea6fc1 @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. 
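As a concrete reference for the Veo parameters described above, here is a minimal sketch assuming the google-genai Python SDK and a Veo model id such as veo-2.0-generate-001 (check the Models page for current ids); video generation is long-running, so the returned operation is polled until it completes:

Python
# Minimal sketch: map the parameters above onto GenerateVideosConfig and
# poll the long-running operation. Field names assume the Python SDK's
# snake_case equivalents of aspectRatio, personGeneration, etc.
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Aerial shot of a lighthouse on a rocky coast at sunset",
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",            # or "9:16"
        person_generation="dont_allow",
        number_of_videos=1,
        duration_seconds=8,
    ),
)

while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

for n, generated in enumerate(operation.response.generated_videos):
    client.files.download(file=generated.video)
    generated.video.save(f"veo_output_{n}.mp4")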
Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/c7f8313c-2940-4dd7-aa2f-ab8c03b8af87 b/docstore/c7f8313c-2940-4dd7-aa2f-ab8c03b8af87 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/c7f8313c-2940-4dd7-aa2f-ab8c03b8af87 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/c8137465-d25d-4926-867c-7e254683e709 b/docstore/c8137465-d25d-4926-867c-7e254683e709 new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/c8137465-d25d-4926-867c-7e254683e709 @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. 
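The same file-based batch job can also be created through the Python SDK. The following is a minimal sketch, assuming the google-genai SDK and a hypothetical local JSONL file of GenerateContent requests:

Python
# Minimal sketch: upload a JSONL requests file, create a batch job from it,
# and keep the returned job name for later status polling.
from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical local file of newline-delimited GenerateContent requests.
uploaded_file = client.files.upload(
    file="my-batch-requests.jsonl",
    config=types.UploadFileConfig(display_name="my-batch-requests", mime_type="jsonl"),
)

batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=uploaded_file.name,
    config={"display_name": "my-batch-requests"},
)

print(f"Created batch job: {batch_job.name}")  # e.g. a name of the form batches/...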
The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/c8167db4-cfc8-4038-aea9-99d45376dd80 b/docstore/c8167db4-cfc8-4038-aea9-99d45376dd80 new file mode 100644 index 0000000000000000000000000000000000000000..097e48b20f2cbfa1b05db2a0f80e7f3c1583707a --- /dev/null +++ b/docstore/c8167db4-cfc8-4038-aea9-99d45376dd80 @@ -0,0 +1 @@ +Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Gemini Developer API Get a Gemini API Key Get a Gemini API key and make your first API request in minutes. Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil )) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } Java package com.example ; import com.google.genai.Client ; import com.google.genai.types.GenerateContentResponse ; public class GenerateTextFromTextInput { public static void main ( String [] args ) { Client client = new Client (); GenerateContentResponse response = client . 
models . generateContent ( "gemini-2.5-flash" , "Explain how AI works in a few words" , null ); System . out . println ( response . text ()); } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H \ No newline at end of file diff --git a/docstore/c818b864-6778-4141-96d9-07721b48db89 b/docstore/c818b864-6778-4141-96d9-07721b48db89 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/c818b864-6778-4141-96d9-07721b48db89 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/c83522d3-7a70-482a-9527-e1a263c7b13e b/docstore/c83522d3-7a70-482a-9527-e1a263c7b13e new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/c83522d3-7a70-482a-9527-e1a263c7b13e @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . 
I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/c8537331-55a5-404b-8f5d-5fcc7f3fb065 b/docstore/c8537331-55a5-404b-8f5d-5fcc7f3fb065 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/c8537331-55a5-404b-8f5d-5fcc7f3fb065 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/c859cc15-123a-4ec4-8aab-52db1e32e97f b/docstore/c859cc15-123a-4ec4-8aab-52db1e32e97f new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/c859cc15-123a-4ec4-8aab-52db1e32e97f @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/c85d7dc0-b2ff-4827-b295-766ce079d424 b/docstore/c85d7dc0-b2ff-4827-b295-766ce079d424 new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/c85d7dc0-b2ff-4827-b295-766ce079d424 @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . 
setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/c8ace79f-7d4e-43bb-8fc4-a5e744291351 b/docstore/c8ace79f-7d4e-43bb-8fc4-a5e744291351 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/c8ace79f-7d4e-43bb-8fc4-a5e744291351 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/c8c39186-e509-45d5-a18f-3c88bd8db0ee b/docstore/c8c39186-e509-45d5-a18f-3c88bd8db0ee new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/c8c39186-e509-45d5-a18f-3c88bd8db0ee @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/c8c64a70-c570-4960-8334-a1b8629923fd b/docstore/c8c64a70-c570-4960-8334-a1b8629923fd new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/c8c64a70-c570-4960-8334-a1b8629923fd @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... 
(gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/c8d6874f-627b-45e6-a074-01ded7e3c0c8 b/docstore/c8d6874f-627b-45e6-a074-01ded7e3c0c8 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/c8d6874f-627b-45e6-a074-01ded7e3c0c8 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. 
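To make the zero-shot versus few-shot distinction above concrete, here is a minimal sketch of a few-shot prompt sent through the Python SDK. The sentiment-classification task and the example reviews are invented for illustration; only the pattern (worked examples followed by the new input) is the point.

Python
from google import genai

client = genai.Client()

# A few-shot prompt: two worked examples establish the expected output format
# before the new input is given.
prompt = """Classify the sentiment of each review as POSITIVE or NEGATIVE.

Review: The battery lasts all day and the screen is gorgeous.
Sentiment: POSITIVE

Review: It stopped charging after a week.
Sentiment: NEGATIVE

Review: Setup was painless and it just works.
Sentiment:"""

response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
print(response.text)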
The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/c8e3728d-b71e-48c3-8e8b-edbfdbb4bad2 b/docstore/c8e3728d-b71e-48c3-8e8b-edbfdbb4bad2 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/c8e3728d-b71e-48c3-8e8b-edbfdbb4bad2 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/c8ec8cc5-851b-4437-9f40-eb81130b8f5b b/docstore/c8ec8cc5-851b-4437-9f40-eb81130b8f5b new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/c8ec8cc5-851b-4437-9f40-eb81130b8f5b @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
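As a rough sketch of how the Gemini 2.5 Flash Preview TTS model introduced above might be called from the Python SDK: the speech_config structure and the Kore voice name are assumptions based on the speech-generation docs, and the response is assumed to carry 24 kHz, 16-bit mono PCM in inline_data.

Python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# Assumed layout: the audio arrives as raw PCM bytes in the first part.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)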
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/c90e73e1-8ef0-4d49-a0a1-c8aacbd79605 b/docstore/c90e73e1-8ef0-4d49-a0a1-c8aacbd79605 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/c90e73e1-8ef0-4d49-a0a1-c8aacbd79605 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
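For the embedding models described above, a minimal sketch of generating an embedding with the Python SDK; it assumes client.models.embed_content() is available and that the result exposes an embeddings list whose entries carry a values vector.

Python
from google import genai

client = genai.Client()

# Embed a single string with text-embedding-004 (768-dimensional output).
result = client.models.embed_content(
    model="text-embedding-004",
    contents="How do hummingbirds hover in place?",
)
print(len(result.embeddings[0].values))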
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/c9223c2d-77c5-424e-ae17-b9db4201ec81 b/docstore/c9223c2d-77c5-424e-ae17-b9db4201ec81 new file mode 100644 index 0000000000000000000000000000000000000000..17bfa7cb7ce514bf0ade86c26f1bff30fbb20a2e --- /dev/null +++ b/docstore/c9223c2d-77c5-424e-ae17-b9db4201ec81 @@ -0,0 +1 @@ +model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ \ No newline at end of file diff --git a/docstore/c9445a6c-7f75-4fb6-8846-8501efe8bef4 b/docstore/c9445a6c-7f75-4fb6-8846-8501efe8bef4 new file mode 100644 index 0000000000000000000000000000000000000000..c2369ca5049154f630fe926e06160c0364720f7c --- /dev/null +++ b/docstore/c9445a6c-7f75-4fb6-8846-8501efe8bef4 @@ -0,0 +1 @@ +const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . 
fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); What's next Read the full Live API Capabilities guide for key capabilities and configurations; including Voice Activity Detection and native audio features. Read the Tool use guide to learn how to integrate Live API with tools and function calling. Read the Session management guide for managing long running conversations. Read the Ephemeral tokens guide for secure authentication in client-to-server applications. For more information about the underlying WebSockets API, see the WebSockets API reference . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/c9551b70-4d88-4f25-9ba0-88f8c026a8da b/docstore/c9551b70-4d88-4f25-9ba0-88f8c026a8da new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/c9551b70-4d88-4f25-9ba0-88f8c026a8da @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 
10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/c989040f-0958-4f7b-b9fd-c40b5edad2ac b/docstore/c989040f-0958-4f7b-b9fd-c40b5edad2ac new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/c989040f-0958-4f7b-b9fd-c40b5edad2ac @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/c9afca42-8da2-406a-ba53-5aaf67929824 b/docstore/c9afca42-8da2-406a-ba53-5aaf67929824 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/c9afca42-8da2-406a-ba53-5aaf67929824 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." 
, }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/c9bc17ae-6c2c-4ec0-8f58-b9777d3c86bc b/docstore/c9bc17ae-6c2c-4ec0-8f58-b9777d3c86bc new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/c9bc17ae-6c2c-4ec0-8f58-b9777d3c86bc @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
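For the Imagen models listed above, a rough sketch of an image-generation request with the Python SDK; generate_images, GenerateImagesConfig, and the generated_images[...].image.image_bytes layout are assumptions based on the image-generation docs, and the prompt is illustrative.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A hummingbird hovering over a red trumpet flower, macro photo",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Each generated image carries raw bytes that can be written straight to disk.
with open("hummingbird.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)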
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/c9ca2db2-7f20-4f97-8103-5c0bbfda86a9 b/docstore/c9ca2db2-7f20-4f97-8103-5c0bbfda86a9 new file mode 100644 index 0000000000000000000000000000000000000000..66605c2cc9856bc6e96789ee99bba119781fed83 --- /dev/null +++ b/docstore/c9ca2db2-7f20-4f97-8103-5c0bbfda86a9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-embedding Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c9d32d13-aa80-463c-8382-a80b36c111d2 b/docstore/c9d32d13-aa80-463c-8382-a80b36c111d2 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/c9d32d13-aa80-463c-8382-a80b36c111d2 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/c9d43450-b539-435d-8bd5-257a7f8b919c b/docstore/c9d43450-b539-435d-8bd5-257a7f8b919c new file mode 100644 index 0000000000000000000000000000000000000000..8ae055ee25ee10e0bf5368b2d5c01f7fd2abd6b2 --- /dev/null +++ b/docstore/c9d43450-b539-435d-8bd5-257a7f8b919c @@ -0,0 +1 @@ +model can then retrieve content from the URLs and use that content to inform and shape its response. 
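A minimal Python sketch of adjusting the thinking budget on a 2.5 model, mirroring the REST thinkingConfig / thinkingBudget field shown elsewhere in these docs; types.ThinkingConfig(thinking_budget=...) is assumed to be the SDK equivalent.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    # A budget of 0 turns thinking off; larger values allow more thinking tokens.
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)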
You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. \ No newline at end of file diff --git a/docstore/c9d91c27-b43c-431e-a067-5fb263c0becb b/docstore/c9d91c27-b43c-431e-a067-5fb263c0becb new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/c9d91c27-b43c-431e-a067-5fb263c0becb @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/c9e8e154-99a5-42ea-bb67-c32cd4b321f5 b/docstore/c9e8e154-99a5-42ea-bb67-c32cd4b321f5 new file mode 100644 index 0000000000000000000000000000000000000000..c13a60d35fca6df29718afae0829c4c40262f690 --- /dev/null +++ b/docstore/c9e8e154-99a5-42ea-bb67-c32cd4b321f5 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate-to-cloud#main-content Title: Gemini Developer API v.s. 
Vertex AI | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/c9f0f908-4d04-4062-9d9a-01449a1cf7ae b/docstore/c9f0f908-4d04-4062-9d9a-01449a1cf7ae new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/c9f0f908-4d04-4062-9d9a-01449a1cf7ae @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/ca0dfa74-1d6d-4c57-9780-184da7944200 b/docstore/ca0dfa74-1d6d-4c57-9780-184da7944200 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/ca0dfa74-1d6d-4c57-9780-184da7944200 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. 
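Returning to the function calling modes (AUTO, ANY, NONE) described earlier: the Python snippet there breaks off at types.ToolConfig(, so the following hedged sketch shows how that configuration is typically completed. The power_disco_ball name reuses the earlier declaration, and the exact field names are assumptions.

Python
from google.genai import types

# Force a function call and restrict the model to one declared tool;
# AUTO and NONE are configured the same way via the mode field.
tool_config = types.ToolConfig(
    function_calling_config=types.FunctionCallingConfig(
        mode="ANY",
        allowed_function_names=["power_disco_ball"],
    )
)
# Pass tool_config alongside tools=[...] in GenerateContentConfig.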
Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/ca15caac-c4fd-4047-9561-636eee891d37 b/docstore/ca15caac-c4fd-4047-9561-636eee891d37 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/ca15caac-c4fd-4047-9561-636eee891d37 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . 
text ); } await main (); \ No newline at end of file diff --git a/docstore/ca437275-428c-46d0-b593-b9ea48456856 b/docstore/ca437275-428c-46d0-b593-b9ea48456856 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/ca437275-428c-46d0-b593-b9ea48456856 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/ca437f7d-32f5-4b99-9001-9b0b73729b09 b/docstore/ca437f7d-32f5-4b99-9001-9b0b73729b09 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/ca437f7d-32f5-4b99-9001-9b0b73729b09 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
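As a follow-on to the system-instruction example above, a minimal sketch showing that other generation settings (temperature, top-p, output length) ride along in the same GenerateContentConfig; the specific values are illustrative only.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Hello there",
    config=types.GenerateContentConfig(
        system_instruction="You are a cat. Your name is Neko.",
        temperature=0.2,  # lower values reduce randomness
        top_p=0.95,
        max_output_tokens=200,
    ),
)
print(response.text)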
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/ca45a921-4a65-4275-b832-be3d3282e20e b/docstore/ca45a921-4a65-4275-b832-be3d3282e20e new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/ca45a921-4a65-4275-b832-be3d3282e20e @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/ca5050a5-def6-4c63-af3a-978a69bfb640 b/docstore/ca5050a5-def6-4c63-af3a-978a69bfb640 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/ca5050a5-def6-4c63-af3a-978a69bfb640 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/ca5786d9-1fc5-4671-a06e-f4c9315b5f32 b/docstore/ca5786d9-1fc5-4671-a06e-f4c9315b5f32 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/ca5786d9-1fc5-4671-a06e-f4c9315b5f32 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. 
Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/ca7cf8e5-d1ca-4ca7-8108-5e0a64a2d3eb b/docstore/ca7cf8e5-d1ca-4ca7-8108-5e0a64a2d3eb new file mode 100644 index 0000000000000000000000000000000000000000..7a3641a5c2a43bb51a841e73c15da5cf081ba66c --- /dev/null +++ b/docstore/ca7cf8e5-d1ca-4ca7-8108-5e0a64a2d3eb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-settings Title: Safety settings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ca98f133-a1b7-41fe-9672-d35ae8c70b4c b/docstore/ca98f133-a1b7-41fe-9672-d35ae8c70b4c new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/ca98f133-a1b7-41fe-9672-d35ae8c70b4c @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/caaaf85c-f9cf-450f-ba2c-47321abcf02d b/docstore/caaaf85c-f9cf-450f-ba2c-47321abcf02d new file mode 100644 index 0000000000000000000000000000000000000000..a7a45ba33fb66e79b4a3cb3ed45264da6b432660 --- /dev/null +++ b/docstore/caaaf85c-f9cf-450f-ba2c-47321abcf02d @@ -0,0 +1 @@ +Gemini API quickstart | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API quickstart This quickstart shows you how to install our libraries and make your first Gemini API request. Before you begin You need a Gemini API key. If you don't already have one, you can get it for free in Google AI Studio . 
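Once the SDK is installed (next section) and GEMINI_API_KEY is exported, a first request can look like the following minimal Python sketch; the prompt text is illustrative.

Python
from google import genai

# The client picks up GEMINI_API_KEY from the environment automatically.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)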
Install the Google GenAI SDK Python Using Python 3.9+ , install the google-genai package using the following pip command : pip install -q -U google-genai JavaScript Using Node.js v18+ , install the Google Gen AI SDK for TypeScript and JavaScript using the following npm command : npm install @google/genai Go Install google.golang.org/genai in your module directory using the go get command : go get google.golang.org/genai Java If you're using Maven, you can install google-genai by adding the following to your dependencies: com.google.genai google-genai 1.0.0 Apps Script To create a new Apps Script project, go to script.new . Click Untitled project . Rename the Apps Script project AI Studio and click Rename . Set your API key At the left, click Project Settings . Under Script Properties click Add script property . For Property , enter the key name: GEMINI_API_KEY . For Value , enter the value for the API key. Click Save script properties . Replace the Code.gs file contents with the following code: Make your first request Here is an example that uses the generateContent method to send a request to the Gemini API using the Gemini 2.5 Flash model. If you set your API key as the environment variable GEMINI_API_KEY , it will be \ No newline at end of file diff --git a/docstore/caba9c97-0cf7-4a56-89a8-5d3fe78e36c6 b/docstore/caba9c97-0cf7-4a56-89a8-5d3fe78e36c6 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/caba9c97-0cf7-4a56-89a8-5d3fe78e36c6 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. 
Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/cabae166-140f-4737-8f9b-b76b866f95f8 b/docstore/cabae166-140f-4737-8f9b-b76b866f95f8 new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/cabae166-140f-4737-8f9b-b76b866f95f8 @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/cae65f7d-f011-4b06-b7de-26358bc333e9 b/docstore/cae65f7d-f011-4b06-b7de-26358bc333e9 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/cae65f7d-f011-4b06-b7de-26358bc333e9 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/caf3af0b-9732-4120-b154-47a4343adc61 b/docstore/caf3af0b-9732-4120-b154-47a4343adc61 new file mode 100644 index 0000000000000000000000000000000000000000..019b7de7e49d445c43758810d78952e4f88cd47b --- /dev/null +++ b/docstore/caf3af0b-9732-4120-b154-47a4343adc61 @@ -0,0 +1 @@ +prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . 
functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . \ No newline at end of file diff --git a/docstore/cb08a742-34a3-43b0-b899-cf2516a7fa8b b/docstore/cb08a742-34a3-43b0-b899-cf2516a7fa8b new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/cb08a742-34a3-43b0-b899-cf2516a7fa8b @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
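A sketch of how the remainder of step 4 can look in Python, assuming the client, the tools config with the function declarations, the running contents list, and the tool_call and result values from the earlier steps (the types.Part.from_function_response helper is the documented way to wrap a function result):

from google.genai import types

# Wrap the locally computed function result as a function-response part.
function_response_part = types.Part.from_function_response(
    name=tool_call.name,
    response={"result": result},
)

# Append the model's function-call turn, then the function result, and call the model again.
contents.append(response.candidates[0].content)
contents.append(types.Content(role="user", parts=[function_response_part]))

final_response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=contents,
    config=config,  # the same config that declared the tools
)
print(final_response.text)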
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/cb0ebd9b-5b2d-40eb-a081-09a2e9d1b40e b/docstore/cb0ebd9b-5b2d-40eb-a081-09a2e9d1b40e new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/cb0ebd9b-5b2d-40eb-a081-09a2e9d1b40e @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? 
Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/cb20043d-2496-4c69-9e31-d8ea9b26b99d b/docstore/cb20043d-2496-4c69-9e31-d8ea9b26b99d new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/cb20043d-2496-4c69-9e31-d8ea9b26b99d @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/cb44a5f3-359c-47fd-96f8-201becfa071f b/docstore/cb44a5f3-359c-47fd-96f8-201becfa071f new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/cb44a5f3-359c-47fd-96f8-201becfa071f @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. 
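The conversational system instruction shown above can also be passed through the request config rather than the prompt text; a minimal Python sketch, assuming the system_instruction field of GenerateContentConfig:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is a smart way to make a business that sells DVDs in 2025?",
    config=types.GenerateContentConfig(
        system_instruction=(
            "All questions should be answered comprehensively with details, "
            "unless the user requests a concise response specifically."
        ),
    ),
)
print(response.text)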
Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/cb550011-19c0-410e-a765-325d3e90617d b/docstore/cb550011-19c0-410e-a765-325d3e90617d new file mode 100644 index 0000000000000000000000000000000000000000..cd1a6469d32d4344455628e24b4f24d47cbf3ee6 --- /dev/null +++ b/docstore/cb550011-19c0-410e-a765-325d3e90617d @@ -0,0 +1 @@ +new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "List a few popular cookie recipes, and include the amounts of ingredients." , config : { responseMimeType : "application/json" , responseSchema : { type : Type . ARRAY , items : { type : Type . OBJECT , properties : { recipeName : { type : Type . STRING , }, ingredients : { type : Type . ARRAY , items : { type : Type . STRING , }, }, }, propertyOrdering : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { ResponseMIMEType : "application/json" , ResponseSchema : & genai . Schema { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeObject , Properties : map [ string ] * genai . Schema { "recipeName" : { Type : genai . TypeString }, "ingredients" : { Type : genai . TypeArray , Items : & genai . Schema { Type : genai . TypeString }, }, }, PropertyOrdering : [] string { "recipeName" , "ingredients" }, }, }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "List a few popular cookie recipes, and include the amounts of ingredients." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "List a few popular cookie recipes, and include the amounts of ingredients." 
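In code, adding that kind of context simply means passing the reference material alongside the question; a small sketch where the troubleshooting_guide string is a hypothetical stand-in for the router documentation you would paste in:

from google import genai

client = genai.Client()

# Hypothetical excerpt standing in for the router's troubleshooting guide.
troubleshooting_guide = (
    "Slowly pulsing yellow: the Wifi point cannot reach the internet. "
    "Check that the modem is powered on and the WAN cable is connected."
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "Answer the question using the troubleshooting guide below.",
        troubleshooting_guide,
        "Question: What should I do to fix my disconnected wifi? "
        "The light on my Google Wifi router is yellow and blinking slowly.",
    ],
)
print(response.text)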
} ] }], "generationConfig": { "responseMimeType": "application/json", "responseSchema": { "type": "ARRAY", "items": { "type": "OBJECT", "properties": { "recipeName": { \ No newline at end of file diff --git a/docstore/cb5b9458-97e4-4680-90dd-3eb825518bc8 b/docstore/cb5b9458-97e4-4680-90dd-3eb825518bc8 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/cb5b9458-97e4-4680-90dd-3eb825518bc8 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/cb82e68f-c875-4613-9b78-409ef238cb8b b/docstore/cb82e68f-c875-4613-9b78-409ef238cb8b new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/cb82e68f-c875-4613-9b78-409ef238cb8b @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." 
}]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } \ No newline at end of file diff --git a/docstore/cb99a2f7-f01a-45e6-bb93-d357bf208765 b/docstore/cb99a2f7-f01a-45e6-bb93-d357bf208765 new file mode 100644 index 0000000000000000000000000000000000000000..5f8a5e922d24af531eff4f89e4f99a5736b0820b --- /dev/null +++ b/docstore/cb99a2f7-f01a-45e6-bb93-d357bf208765 @@ -0,0 +1 @@ +(`totalTokenCount`). console . log ( generateResult . response . usageMetadata ); // candidatesTokenCount and totalTokenCount depend on response, may vary // { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 } After Python from google import genai client = genai . Client () response = client . models . count_tokens ( model = 'gemini-2.0-flash' , contents = 'The quick brown fox jumps over the lazy dog.' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const prompt = "The quick brown fox jumps over the lazy dog." ; const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( countTokensResponse . totalTokens ); const generateResponse = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( generateResponse . usageMetadata ); Generate images Generate images: Before Python #pip install https://github.com/google-gemini/generative-ai-python@imagen import google.generativeai as genai imagen = genai . ImageGenerationModel ( "imagen-3.0-generate-001" ) gen_images = imagen . generate_images ( prompt = "Robot holding a red skateboard" , number_of_images = 1 , safety_filter_level = "block_low_and_above" , person_generation = "allow_adult" , aspect_ratio = "3:4" , ) After Python from google import genai client = genai . Client () gen_images = client . models . generate_images ( model = 'imagen-3.0-generate-001' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 1 , safety_filter_level = "BLOCK_LOW_AND_ABOVE" , person_generation = "ALLOW_ADULT" , aspect_ratio = "3:4" , ) ) for n , image in enumerate ( gen_images . generated_images ): pathlib . Path ( f ' { n } .png' ) . write_bytes ( image . image . 
image_bytes ) Embed content Generate content embeddings. Before Python import google.generativeai as genai response \ No newline at end of file diff --git a/docstore/cba22094-4852-4920-b2b2-8f0afa278bef b/docstore/cba22094-4852-4920-b2b2-8f0afa278bef new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/cba22094-4852-4920-b2b2-8f0afa278bef @@ -0,0 +1 @@ +a picture of me. Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me. Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/cbcd7017-6a58-40a9-8a20-341880abb1d7 b/docstore/cbcd7017-6a58-40a9-8a20-341880abb1d7 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/cbcd7017-6a58-40a9-8a20-341880abb1d7 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
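The Go and REST examples above request both TEXT and IMAGE response modalities; the text-to-image (interleaved) mode can be sketched in Python as follows, assuming Pillow is available for saving the returned image bytes:

from io import BytesIO
from PIL import Image
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate an illustrated recipe for a paella.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# The response interleaves text parts and inline image data.
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data is not None:
        Image.open(BytesIO(part.inline_data.data)).save("paella.png")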
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/cbf31058-1fa7-42a3-9b3e-da8c85e1038d b/docstore/cbf31058-1fa7-42a3-9b3e-da8c85e1038d new file mode 100644 index 0000000000000000000000000000000000000000..2abf25ff1d94e62b1cf39304ec67a36de21f9d87 --- /dev/null +++ b/docstore/cbf31058-1fa7-42a3-9b3e-da8c85e1038d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#proactive-audio Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/cbf7744e-5565-4b82-a300-afb685a740bb b/docstore/cbf7744e-5565-4b82-a300-afb685a740bb new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/cbf7744e-5565-4b82-a300-afb685a740bb @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. 
This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/cc33376d-b02d-4b3e-88eb-0710667cdfb0 b/docstore/cc33376d-b02d-4b3e-88eb-0710667cdfb0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/cc33376d-b02d-4b3e-88eb-0710667cdfb0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
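Once the connection above is open, a turn can be sent and the streamed reply read back; a minimal Python sketch using the same model and TEXT-only config, and the send_client_content / receive calls shown elsewhere in these docs:

import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send one user turn, then print streamed text until the turn completes.
        await session.send_client_content(turns={"parts": [{"text": "Hello, Gemini"}]})
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

if __name__ == "__main__":
    asyncio.run(main())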
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/cc44b057-98b2-4880-a2f1-3170ad1c24c7 b/docstore/cc44b057-98b2-4880-a2f1-3170ad1c24c7 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/cc44b057-98b2-4880-a2f1-3170ad1c24c7 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/cc493748-b84b-44f1-9c43-7784e0a1feb3 b/docstore/cc493748-b84b-44f1-9c43-7784e0a1feb3 new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/cc493748-b84b-44f1-9c43-7784e0a1feb3 @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. 
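In code, the version name patterns described above simply become the model string passed to the SDK; a small sketch using the example names given there (pinning a specific stable version is the recommendation for production apps):

from google import genai

client = genai.Client()

LATEST_STABLE = "gemini-2.0-flash"        # latest stable for this generation and variation
PINNED_STABLE = "gemini-2.0-flash-001"    # specific stable version, recommended for production
PREVIEW = "gemini-2.5-pro-preview-06-05"  # preview version, more restrictive rate limits

response = client.models.generate_content(
    model=PINNED_STABLE,
    contents="Say hello in one short sentence.",
)
print(response.text)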
google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/cc4a5482-4025-4dc0-93e6-bf879631c266 b/docstore/cc4a5482-4025-4dc0-93e6-bf879631c266 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/cc4a5482-4025-4dc0-93e6-bf879631c266 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/cc524acb-49cc-431b-9209-09b29ddf473b b/docstore/cc524acb-49cc-431b-9209-09b29ddf473b new file mode 100644 index 0000000000000000000000000000000000000000..fde5008e10da059aa2ac847e9fab5e369116574b --- /dev/null +++ b/docstore/cc524acb-49cc-431b-9209-09b29ddf473b @@ -0,0 +1 @@ +You can set fields as required to force the model to provide a value. If there's insufficient context in the associated input prompt, the model generates responses mainly based on the data it was trained on. A complex schema can result in an InvalidArgument: 400 error. Complexity might come from long property names, long array length limits, enums with many values, objects with lots of optional properties, or a combination of these factors. If you get this error with a valid schema, make one or more of the following changes to resolve the error: Shorten property names or enum names. Flatten nested arrays. Reduce the number of properties with constraints, such as numbers with minimum and maximum limits. Reduce the number of properties with complex constraints, such as properties with complex formats like date-time . Reduce the number of optional properties. Reduce the number of valid values for enums. If you aren't seeing the results you expect, add more context to your input prompts or revise your response schema. For example, review the model's response without structured output to see how the model responds. You can then update your response schema so that it better fits the model's output. What's next Now that you've learned how to generate structured output, you might want to try using Gemini API tools: Function calling Code execution Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/cc64a1df-3bbf-4a88-abe6-4fdb2d0d103a b/docstore/cc64a1df-3bbf-4a88-abe6-4fdb2d0d103a new file mode 100644 index 0000000000000000000000000000000000000000..8219f10e184a0891e4bb35822a37a2ddc4e20372 --- /dev/null +++ b/docstore/cc64a1df-3bbf-4a88-abe6-4fdb2d0d103a @@ -0,0 +1 @@ +"type": "STRING" }, "ingredients": { "type": "ARRAY", "items": { "type": "STRING" } } }, "propertyOrdering": ["recipeName", "ingredients"] } } } }' 2 > /dev/null | head The output might look like this: [ { "recipeName" : "Chocolate Chip Cookies" , "ingredients" : [ "1 cup (2 sticks) unsalted butter, softened" , "3/4 cup granulated sugar" , "3/4 cup packed brown sugar" , "1 teaspoon vanilla extract" , "2 large eggs" , "2 1/4 cups all-purpose flour" , "1 teaspoon baking soda" , "1 teaspoon salt" , "2 cups chocolate chips" ] }, ... ] Providing a schema in a text prompt Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended , because it might produce lower quality output, and because the model is not constrained to follow the schema. Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. 
Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. \ No newline at end of file diff --git a/docstore/cc65a0f6-153c-4754-bced-f6893595e517 b/docstore/cc65a0f6-153c-4754-bced-f6893595e517 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/cc65a0f6-153c-4754-bced-f6893595e517 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
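A hedged Python sketch of constraining the output to a single enum value, assuming the text/x.enum response MIME type described for structured output (the instrument categories here are illustrative):

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What type of instrument is an oboe?",
    config=types.GenerateContentConfig(
        response_mime_type="text/x.enum",
        response_schema={
            "type": "STRING",
            "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"],
        },
    ),
)
print(response.text)  # expected to be one of the enum values, e.g. "Woodwind"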
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/cc710d24-a314-4e5c-b634-f28e3ec68b5c b/docstore/cc710d24-a314-4e5c-b634-f28e3ec68b5c new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/cc710d24-a314-4e5c-b634-f28e3ec68b5c @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/cc719366-ea89-4e99-89b5-8f589a9eec86 b/docstore/cc719366-ea89-4e99-89b5-8f589a9eec86 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/cc719366-ea89-4e99-89b5-8f589a9eec86 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker 
text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/cc8537cd-63a0-4d71-ad35-76dbf8ab9256 b/docstore/cc8537cd-63a0-4d71-ad35-76dbf8ab9256 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/cc8537cd-63a0-4d71-ad35-76dbf8ab9256 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/cc87e92a-c745-4d87-b9ee-dbc5fc1d1409 b/docstore/cc87e92a-c745-4d87-b9ee-dbc5fc1d1409 new file mode 100644 index 0000000000000000000000000000000000000000..2c0be12e5de5e086f593ed916f2fdecc774a45a4 --- /dev/null +++ b/docstore/cc87e92a-c745-4d87-b9ee-dbc5fc1d1409 @@ -0,0 +1 @@ +Text generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents \ No newline at end of file diff --git a/docstore/cc9bfd0a-a0ae-4663-ab34-9c7119d7d673 b/docstore/cc9bfd0a-a0ae-4663-ab34-9c7119d7d673 new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/cc9bfd0a-a0ae-4663-ab34-9c7119d7d673 @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . 
ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/cc9cea37-ee79-45b9-bff1-fd06c52abcf0 b/docstore/cc9cea37-ee79-45b9-bff1-fd06c52abcf0 new file mode 100644 index 0000000000000000000000000000000000000000..b99824a0bb181cb1be6367ec11bfeefdd4ec4b3d --- /dev/null +++ b/docstore/cc9cea37-ee79-45b9-bff1-fd06c52abcf0 @@ -0,0 +1 @@ +AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass audio data inline Instead of uploading an audio file, you can pass inline audio data in the request to generateContent : Python from google.genai import types with open ( 'path/to/small-sample.mp3' , 'rb' ) as f : audio_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ 'Describe this audio clip' , types . Part . from_bytes ( data = audio_bytes , mime_type = 'audio/mp3' , ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64AudioFile = fs . readFileSync ( "path/to/small-sample.mp3" , { encoding : "base64" , }); const contents = [ { text : "Please summarize the audio." 
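The segmentation example above also assumes imports of json, base64, io, os, numpy, and PIL, and calls a parse_json helper that is not shown in this excerpt. One plausible implementation, offered here only as an assumption (the actual helper may differ), strips a Markdown code fence from the model's JSON output before parsing:

def parse_json(json_output: str) -> str:
    # Hypothetical helper: remove a ```json ... ``` fence if the model wrapped its output in one.
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "```json":
            json_output = "\n".join(lines[i + 1:])
            json_output = json_output.split("```")[0]
            break
    return json_output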
}, { inlineData : { mimeType : "audio/mp3" , data : base64AudioFile , }, }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } audioBytes , _ := os . ReadFile ( "/path/to/small-sample.mp3" ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), & genai . \ No newline at end of file diff --git a/docstore/cca6ad6c-033e-4765-b4f6-1a718868be05 b/docstore/cca6ad6c-033e-4765-b4f6-1a718868be05 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/cca6ad6c-033e-4765-b4f6-1a718868be05 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation> . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, may come with more restrictive rate limits, and may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version> . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation
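As a small illustration of the version naming patterns above (the prompt text is made up for this sketch; the model ids are the ones quoted as examples on this page):

from google import genai

client = genai.Client()
# Pin an explicit stable version for production so behaviour doesn't shift underneath you;
# the shorter alias without a trailing version number tracks the latest stable release instead.
response = client.models.generate_content(
    model="gemini-2.0-flash-001",  # pinned stable version from the examples above
    contents="Summarize why pinning a model version matters, in one sentence.",
)
print(response.text)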
) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const chat = model . startChat ({ history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); let result = await chat . sendMessage ( "I have 2 dogs in my house." ); console . log ( result . response . text ()); result = await chat . sendMessage ( "How many paws are in my house?" ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) cs := model . StartChat () cs . History = [] * genai . Content { { Parts : [] genai . Part { genai . Text ( "Hello, I have 2 dogs in my house." ), }, Role : "user" , }, { Parts : [] genai . Part { genai . Text ( "Great to meet you. What would you like to know?" ), }, Role : "model" , }, } res , err := cs . SendMessage ( ctx , genai . Text ( "How many paws are in my house?" )) if err != nil { log . Fatal ( err ) } printResponse ( res ) // utility for printing the response After Python from google import genai client = genai . Client () chat = client . chats . create ( model = 'gemini-2.0-flash' ) response = chat . send_message ( message = 'Tell me a story in 100 words' ) response = \ No newline at end of file diff --git a/docstore/ccda5303-2503-4fca-bdc6-ca592b49d52c b/docstore/ccda5303-2503-4fca-bdc6-ca592b49d52c new file mode 100644 index 0000000000000000000000000000000000000000..b73659061f0ce2830a1e6cf67f6a74b5cc699bc6 --- /dev/null +++ b/docstore/ccda5303-2503-4fca-bdc6-ca592b49d52c @@ -0,0 +1 @@ +"turn_off_the_lights" } tools = [{ "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Turn on the lights please" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) elif chunk . tool_call : function_responses = [] for fc in chunk . tool_call . function_calls : function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" } # simple, hard-coded function response ) function_responses . append ( function_response ) await session . send_tool_response ( function_responses = function_responses ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; // Simple function definitions const turn_on_the_lights = { name : "turn_on_the_lights" } // , description: '...', parameters: { ... } const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
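The migrated "After" Python chat snippet above is cut off at its second send_message call. Based on the "Before" version, the complete snippet presumably looks like the following; treat this as a reconstruction rather than the original page's code:

from google import genai

client = genai.Client()
chat = client.chats.create(model="gemini-2.0-flash")
response = chat.send_message(message="Tell me a story in 100 words")
print(response.text)
# The follow-up message is carried over from the "Before" snippet.
response = chat.send_message(message="What happened after that?")
print(response.text)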
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { \ No newline at end of file diff --git a/docstore/ccffefb6-0695-4a2d-a84b-451d92a0d105 b/docstore/ccffefb6-0695-4a2d-a84b-451d92a0d105 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/ccffefb6-0695-4a2d-a84b-451d92a0d105 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/cd1c216d-c7a7-43f6-a11e-88ffe87140c5 b/docstore/cd1c216d-c7a7-43f6-a11e-88ffe87140c5 new file mode 100644 index 0000000000000000000000000000000000000000..9dd2717ee06a5cb6666e2976e86c3492640fe1f5 --- /dev/null +++ b/docstore/cd1c216d-c7a7-43f6-a11e-88ffe87140c5 @@ -0,0 +1 @@ +'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . 
log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , \ No newline at end of file diff --git a/docstore/cd21f606-6972-4724-a939-3e0abe6902da b/docstore/cd21f606-6972-4724-a939-3e0abe6902da new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/cd21f606-6972-4724-a939-3e0abe6902da @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
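The text-generation walkthrough above points to GenerateContentConfig for the full list of configurable parameters but does not show a configured request in this excerpt. A minimal sketch of passing a few common options (the specific values and the choice of system_instruction, temperature, and max_output_tokens are illustrative assumptions, not taken from the original page):

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="How does AI work?",
    config=types.GenerateContentConfig(
        system_instruction="Answer like a patient teacher.",  # assumed example instruction
        temperature=0.2,        # lower temperature for more deterministic output
        max_output_tokens=512,  # cap the response length
    ),
)
print(response.text)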
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/cd2694d8-16f3-488f-833b-a129d79a5123 b/docstore/cd2694d8-16f3-488f-833b-a129d79a5123 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/cd2694d8-16f3-488f-833b-a129d79a5123 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/cd3fb9b8-b6b2-4bde-bad6-f9e59d2e1bc5 b/docstore/cd3fb9b8-b6b2-4bde-bad6-f9e59d2e1bc5 new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/cd3fb9b8-b6b2-4bde-bad6-f9e59d2e1bc5 @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/cd4e8151-c679-4f83-9363-9683ea227e79 b/docstore/cd4e8151-c679-4f83-9363-9683ea227e79 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/cd4e8151-c679-4f83-9363-9683ea227e79 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
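Tying the automatic-VAD configuration above to an actual session, a sketch of streaming a local PCM file with send_realtime_input might look like the following (the model name, file path, and Blob-based audio argument are assumptions based on the Live API snippets elsewhere on this page):

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {
    "response_modalities": ["TEXT"],
    # Reuse the automatic_activity_detection settings shown above if you need custom VAD behaviour.
    "realtime_input_config": {
        "automatic_activity_detection": {
            "disabled": False,
            "prefix_padding_ms": 20,
            "silence_duration_ms": 100,
        }
    },
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        with open("sample.pcm", "rb") as f:  # 16 kHz, 16-bit mono PCM is assumed here
            audio_bytes = f.read()
        await session.send_realtimeInput if False else None  # (placeholder removed)
        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )
        async for message in session.receive():
            if message.text is not None:
                print(message.text)

if __name__ == "__main__":
    asyncio.run(main())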
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/cd613d92-c78e-4f6b-9985-d2e6da6d0870 b/docstore/cd613d92-c78e-4f6b-9985-d2e6da6d0870 new file mode 100644 index 0000000000000000000000000000000000000000..6b5570dc552776eef13cf8339199673fd1c28eb5 --- /dev/null +++ b/docstore/cd613d92-c78e-4f6b-9985-d2e6da6d0870 @@ -0,0 +1 @@ +Generate an image Note: Image generation is only available in the paid tier. Generate an image: Python import base64 from openai import OpenAI from PIL import Image from io import BytesIO client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" , ) response = client . images . generate ( model = "imagen-3.0-generate-002" , prompt = "a portrait of a sheepadoodle wearing a cape" , response_format = 'b64_json' , n = 1 , ) for image_data in response . data : image = Image . open ( BytesIO ( base64 . b64decode ( image_data . b64_json ))) image . show () JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" , }); async function main () { const image = await openai . images . generate ( { model : "imagen-3.0-generate-002" , prompt : "a portrait of a sheepadoodle wearing a cape" , response_format : "b64_json" , n : 1 , } ); console . log ( image . data ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/images/generations" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "imagen-3.0-generate-002", "prompt": "a portrait of a sheepadoodle wearing a cape", "response_format": "b64_json", "n": 1, }' Audio understanding Analyze audio input: Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) with open ( "/path/to/your/audio/file.wav" , "rb" ) as audio_file : base64_audio = base64 . b64encode ( audio_file . read ()) . decode ( 'utf-8' ) response = client . chat . completions . 
create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "Transcribe this audio" , }, { "type" : "input_audio" , "input_audio" : { "data" : base64_audio , "format" : "wav" } } ], } ], ) print \ No newline at end of file diff --git a/docstore/cd616666-d0d7-4d43-b206-59a23ff9c46e b/docstore/cd616666-d0d7-4d43-b206-59a23ff9c46e new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/cd616666-d0d7-4d43-b206-59a23ff9c46e @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/cd618e26-21d8-4b1e-91f8-db012f38009c b/docstore/cd618e26-21d8-4b1e-91f8-db012f38009c new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/cd618e26-21d8-4b1e-91f8-db012f38009c @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! 
thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/cd913065-8afa-4ffa-a3d7-a902de8c8186 b/docstore/cd913065-8afa-4ffa-a3d7-a902de8c8186 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/cd913065-8afa-4ffa-a3d7-a902de8c8186 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
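The streaming thought-summary example above appears in JavaScript and Go; a Python sketch of the same pattern with the google-genai SDK, written to mirror those snippets rather than copied from the original page:

from google import genai
from google.genai import types

client = genai.Client()
prompt = "Alice, Bob, and Carol each live in a different house on the same street: ..."  # the riddle from above

stream = client.models.generate_content_stream(
    model="gemini-2.5-pro",
    contents=prompt,
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(include_thoughts=True)
    ),
)
for chunk in stream:
    for part in chunk.candidates[0].content.parts:
        if not part.text:
            continue
        if part.thought:
            print("Thought summary:", part.text)
        else:
            print("Answer:", part.text)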
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/cd9b752a-22e7-44c6-8916-997d66a47b00 b/docstore/cd9b752a-22e7-44c6-8916-997d66a47b00 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/cd9b752a-22e7-44c6-8916-997d66a47b00 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/cd9dd831-8df4-4c56-a3d8-3e1b424f17d8 b/docstore/cd9dd831-8df4-4c56-a3d8-3e1b424f17d8 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/cd9dd831-8df4-4c56-a3d8-3e1b424f17d8 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. 
This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/cdb38cfc-8bac-47c7-a1b8-9665059e73de b/docstore/cdb38cfc-8bac-47c7-a1b8-9665059e73de new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/cdb38cfc-8bac-47c7-a1b8-9665059e73de @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. 
Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/cdd0ad9e-454b-472b-8381-2c0886551103 b/docstore/cdd0ad9e-454b-472b-8381-2c0886551103 new file mode 100644 index 0000000000000000000000000000000000000000..cc967ffee72bfb803a27c22b45a7304a203928ac --- /dev/null +++ b/docstore/cdd0ad9e-454b-472b-8381-2c0886551103 @@ -0,0 +1 @@ +the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct. Now, to make the response even more streamlined, you can ask the model to think step-by-step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt.
Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 \ No newline at end of file diff --git a/docstore/cdd576f6-1d0d-4ad5-bb40-d0a1b8dc1bbd b/docstore/cdd576f6-1d0d-4ad5-bb40-d0a1b8dc1bbd new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/cdd576f6-1d0d-4ad5-bb40-d0a1b8dc1bbd @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/cde5d4fd-4c3d-4d35-964d-ab90d0d4f7b9 b/docstore/cde5d4fd-4c3d-4d35-964d-ab90d0d4f7b9 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/cde5d4fd-4c3d-4d35-964d-ab90d0d4f7b9 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
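Beyond asking for a format in the prompt, as in the "Specify the output format" examples above, the API can also be told to return JSON directly through the generation config. This is a different mechanism (structured output) rather than prompt wording; a sketch assuming the google-genai SDK's response_mime_type and response_schema fields:

from google import genai
from google.genai import types
from pydantic import BaseModel

class Recipe(BaseModel):
    ingredients: list[str]
    type_of_cuisine: str
    vegetarian: bool

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Describe a paella as structured data.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=Recipe,  # the SDK converts the Pydantic model into a JSON schema
    ),
)
print(response.text)  # a JSON string conforming to the schema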
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/cdf5d538-6e7d-4f19-b8f9-7999e22d5636 b/docstore/cdf5d538-6e7d-4f19-b8f9-7999e22d5636 new file mode 100644 index 0000000000000000000000000000000000000000..ffa55cd17dc266b0e00c821779e2850dd473d215 --- /dev/null +++ b/docstore/cdf5d538-6e7d-4f19-b8f9-7999e22d5636 @@ -0,0 +1 @@ +"Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . 
error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' \ No newline at end of file diff --git a/docstore/ce048863-85fc-4498-813d-52691d544059 b/docstore/ce048863-85fc-4498-813d-52691d544059 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/ce048863-85fc-4498-813d-52691d544059 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/ce191424-f68b-400f-87cb-a1b3eecdb4b8 b/docstore/ce191424-f68b-400f-87cb-a1b3eecdb4b8 new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/ce191424-f68b-400f-87cb-a1b3eecdb4b8 @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? 
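Returning to the batch-results walkthrough earlier in this section: the retrieval code assumes the job has already reached a terminal state. A small polling sketch that waits for that point (the set of terminal state names and the sleep interval are assumptions consistent with the states mentioned on this page):

import time
from google import genai

client = genai.Client()
job_name = "YOUR_BATCH_JOB_NAME"  # e.g. inline_batch_job.name from the earlier step

# Assumed terminal states; JOB_STATE_SUCCEEDED is the one the docs check for explicitly.
terminal_states = {"JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"}
batch_job = client.batches.get(name=job_name)
while batch_job.state.name not in terminal_states:
    time.sleep(30)  # poll every 30 seconds (arbitrary choice)
    batch_job = client.batches.get(name=job_name)

print(f"Final state: {batch_job.state.name}")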
Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want to model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/ce20e730-f2b3-4166-8197-87b775721594 b/docstore/ce20e730-f2b3-4166-8197-87b775721594 new file mode 100644 index 0000000000000000000000000000000000000000..4d5e24b23445eada240041ce046d4864e5df3992 --- /dev/null +++ b/docstore/ce20e730-f2b3-4166-8197-87b775721594 @@ -0,0 +1 @@ +anything with a lower probability is allowed. Threshold (Google AI Studio) Threshold (API) Description Block none BLOCK_NONE Always show regardless of probability of unsafe content Block few BLOCK_ONLY_HIGH Block when high probability of unsafe content Block some BLOCK_MEDIUM_AND_ABOVE Block when medium or high probability of unsafe content Block most BLOCK_LOW_AND_ABOVE Block when low, medium or high probability of unsafe content N/A HARM_BLOCK_THRESHOLD_UNSPECIFIED Threshold is unspecified, block using default threshold If the threshold is not set, the default block threshold is Block none (for gemini-1.5-pro-002 and gemini-1.5-flash-002 and all newer stable GA models) or Block some (in all other models) for all categories except the Civic integrity category. The default block threshold for the Civic integrity category is Block none (for gemini-2.0-flash-001 aliased as gemini-2.0-flash , gemini-2.0-pro-exp-02-05 , and gemini-2.0-flash-lite ) both for Google AI Studio and the Gemini API, and Block most for all other models in Google AI Studio only. You can set these settings for each request you make to the generative service. See the HarmBlockThreshold API reference for details. Safety feedback generateContent returns a GenerateContentResponse which includes safety feedback. Prompt feedback is included in promptFeedback . If promptFeedback.blockReason is set, then the content of the prompt was blocked. Response candidate feedback is included in Candidate.finishReason and Candidate.safetyRatings . 
If response content was blocked and the finishReason was SAFETY , you can inspect safetyRatings for more details. The content that was blocked is not returned. Adjust safety settings This section covers how to adjust the safety settings in both Google AI Studio and in your code. Google AI Studio You can adjust safety settings in Google AI Studio, but you cannot turn them off. Click Edit safety settings in the Run settings panel to open the Run safety settings modal. In \ No newline at end of file diff --git a/docstore/ce2598f5-a6cc-4d46-907b-4ac2e43fb293 b/docstore/ce2598f5-a6cc-4d46-907b-4ac2e43fb293 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/ce2598f5-a6cc-4d46-907b-4ac2e43fb293 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/ce2ca3c2-60a9-418b-adab-4ad397b933a6 b/docstore/ce2ca3c2-60a9-418b-adab-4ad397b933a6 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/ce2ca3c2-60a9-418b-adab-4ad397b933a6 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/ce439532-3f85-471f-8a7a-7f6c14a1b104 b/docstore/ce439532-3f85-471f-8a7a-7f6c14a1b104 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/ce439532-3f85-471f-8a7a-7f6c14a1b104 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. 
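For comparison with the Go, REST, and Apps Script streaming-chat samples above, the same multi-turn streaming flow can be sketched in Python with the client.chats interface that also appears later in these docs; the prompt text is a placeholder and this is an illustrative sketch rather than an official sample. Python
from google import genai

client = genai.Client()

# Create a chat session; history accumulates across send_message_stream calls.
chat = client.chats.create(model="gemini-2.5-flash")

# Stream the model's reply chunk by chunk as it arrives.
for chunk in chat.send_message_stream("I have two dogs in my house. How many paws are in my house?"):
    print(chunk.text, end="")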
To learn more about the models \ No newline at end of file diff --git a/docstore/ce8781e1-f029-4c90-b5f2-281eddcab4e4 b/docstore/ce8781e1-f029-4c90-b5f2-281eddcab4e4 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/ce8781e1-f029-4c90-b5f2-281eddcab4e4 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/ce98be5f-4668-4076-a2cd-46e1981b68bb b/docstore/ce98be5f-4668-4076-a2cd-46e1981b68bb new file mode 100644 index 0000000000000000000000000000000000000000..cd05fff49dc646621e4ad5455e6cddce9e307548 --- /dev/null +++ b/docstore/ce98be5f-4668-4076-a2cd-46e1981b68bb @@ -0,0 +1 @@ +models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . \ No newline at end of file diff --git a/docstore/cebab458-16d3-4ee7-8bb3-2f9d1c7b3e92 b/docstore/cebab458-16d3-4ee7-8bb3-2f9d1c7b3e92 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/cebab458-16d3-4ee7-8bb3-2f9d1c7b3e92 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. 
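The ordering tip above (place the text prompt after the image part when sending a single image) can be sketched as follows, reusing the inline-image pattern shown elsewhere in these docs; the file path and prompt are placeholder values. Python
from google import genai
from google.genai import types

client = genai.Client()

with open("path/to/photo.jpg", "rb") as f:
    image_bytes = f.read()

# Image part first, text prompt last, as recommended in the tips above.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg"),
        "Describe what is shown in this image.",
    ],
)
print(response.text)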
\ No newline at end of file diff --git a/docstore/cec9663a-541a-4379-8f65-6837707e8b52 b/docstore/cec9663a-541a-4379-8f65-6837707e8b52 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/cec9663a-541a-4379-8f65-6837707e8b52 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/ceca8f67-c96f-4d72-ba49-733e02e491c5 b/docstore/ceca8f67-c96f-4d72-ba49-733e02e491c5 new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/ceca8f67-c96f-4d72-ba49-733e02e491c5 @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/ced51a78-1303-4c67-b2cd-d3facb3fd692 b/docstore/ced51a78-1303-4c67-b2cd-d3facb3fd692 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/ced51a78-1303-4c67-b2cd-d3facb3fd692 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . 
UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/cf2284d4-9416-4abb-928e-ff4a93226408 b/docstore/cf2284d4-9416-4abb-928e-ff4a93226408 new file mode 100644 index 0000000000000000000000000000000000000000..9b7a72ccaa4be8bd1d0ee10d4c849d278d1b90e5 --- /dev/null +++ b/docstore/cf2284d4-9416-4abb-928e-ff4a93226408 @@ -0,0 +1 @@ +the standard rate limits for GenerateContent apply, and token limits include cached tokens. The number of cached tokens is returned in the usage_metadata from the create, get, and list operations of the cache service, and also in GenerateContent when using the cache. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/cf6fd878-1b3c-4246-9cfe-cc5785f91a12 b/docstore/cf6fd878-1b3c-4246-9cfe-cc5785f91a12 new file mode 100644 index 0000000000000000000000000000000000000000..6d1737de72d3e84541423a1739a6d021fb7e324a --- /dev/null +++ b/docstore/cf6fd878-1b3c-4246-9cfe-cc5785f91a12 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#generate-from-text Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/cf88c5ad-f977-460b-b94b-f3c8901441de b/docstore/cf88c5ad-f977-460b-b94b-f3c8901441de new file mode 100644 index 0000000000000000000000000000000000000000..f8f8fc78a25f955fbb787da7b8a31bd5dafcbc7e --- /dev/null +++ b/docstore/cf88c5ad-f977-460b-b94b-f3c8901441de @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/usage-policies#abuse-monitoring Title: Additional usage policies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/cfb79898-5891-43be-add8-0d8f22017711 b/docstore/cfb79898-5891-43be-add8-0d8f22017711 new file mode 100644 index 0000000000000000000000000000000000000000..41c5d7c70c10b0c099f849b39a650a62d6333896 --- /dev/null +++ b/docstore/cfb79898-5891-43be-add8-0d8f22017711 @@ -0,0 +1 @@ +npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). 
toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is an experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources or prompts. It is available for the Python and JavaScript/TypeScript SDKs. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limitations affect what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. \ No newline at end of file diff --git a/docstore/cfd0b5ca-1fcc-4e6b-8f0a-1f9b67fe959c b/docstore/cfd0b5ca-1fcc-4e6b-8f0a-1f9b67fe959c new file mode 100644 index 0000000000000000000000000000000000000000..a8dcbba04f65c01483cc37e09a606af68459adcf --- /dev/null +++ b/docstore/cfd0b5ca-1fcc-4e6b-8f0a-1f9b67fe959c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#step-2 Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/cfd79beb-cf0a-41b7-92b7-8ff157e81818 b/docstore/cfd79beb-cf0a-41b7-92b7-8ff157e81818 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/cfd79beb-cf0a-41b7-92b7-8ff157e81818 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability.
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/cfe8a3cb-3195-479b-b053-7e9548c88b0a b/docstore/cfe8a3cb-3195-479b-b053-7e9548c88b0a new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/cfe8a3cb-3195-479b-b053-7e9548c88b0a @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/d00efd06-4245-413e-b224-bc21b6ec8ead b/docstore/d00efd06-4245-413e-b224-bc21b6ec8ead new file mode 100644 index 0000000000000000000000000000000000000000..8c60a97b59d947e95247d6e4ee3eb21605ab2ae3 --- /dev/null +++ b/docstore/d00efd06-4245-413e-b224-bc21b6ec8ead @@ -0,0 +1 @@ +open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . 
readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of \ No newline at end of file diff --git a/docstore/d07036b6-b051-4cee-b252-b9f166751fa6 b/docstore/d07036b6-b051-4cee-b252-b9f166751fa6 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/d07036b6-b051-4cee-b252-b9f166751fa6 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. 
The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/d07bf270-ab06-4235-b2f6-c452eefe9f33 b/docstore/d07bf270-ab06-4235-b2f6-c452eefe9f33 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/d07bf270-ab06-4235-b2f6-c452eefe9f33 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/d089904b-dfde-41c8-9abb-63acec1a727c b/docstore/d089904b-dfde-41c8-9abb-63acec1a727c new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/d089904b-dfde-41c8-9abb-63acec1a727c @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." 
) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/d096ea81-619f-4042-a57e-957e7ff3a788 b/docstore/d096ea81-619f-4042-a57e-957e7ff3a788 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/d096ea81-619f-4042-a57e-957e7ff3a788 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/d0e950cb-468b-4437-a84b-9b7f80d365a9 b/docstore/d0e950cb-468b-4437-a84b-9b7f80d365a9 new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/d0e950cb-468b-4437-a84b-9b7f80d365a9 @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/d1070ea2-6f93-4d8c-8a89-a8290475b21b b/docstore/d1070ea2-6f93-4d8c-8a89-a8290475b21b new file mode 100644 index 0000000000000000000000000000000000000000..c3dd5ccab4f70d7a2a8a4ffb78dc41dee62c1a3f --- /dev/null +++ b/docstore/d1070ea2-6f93-4d8c-8a89-a8290475b21b @@ -0,0 +1 @@ +with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. 
To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand \ No newline at end of file diff --git a/docstore/d1135d9c-ae45-4bc5-8bf4-005068d8d605 b/docstore/d1135d9c-ae45-4bc5-8bf4-005068d8d605 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/d1135d9c-ae45-4bc5-8bf4-005068d8d605 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. 
What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/d13c389f-30e0-45f2-8b5a-9d608c85619d b/docstore/d13c389f-30e0-45f2-8b5a-9d608c85619d new file mode 100644 index 0000000000000000000000000000000000000000..7f1c9347d97993455b39cbb73da344d6e62c5076 --- /dev/null +++ b/docstore/d13c389f-30e0-45f2-8b5a-9d608c85619d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/openai Title: OpenAI compatibility | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d152c828-96b7-4a98-9ce1-1a6c3719c123 b/docstore/d152c828-96b7-4a98-9ce1-1a6c3719c123 new file mode 100644 index 0000000000000000000000000000000000000000..96cef7defc13924a885e51af123ecd669ceba8d6 --- /dev/null +++ b/docstore/d152c828-96b7-4a98-9ce1-1a6c3719c123 @@ -0,0 +1 @@ +YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , config = GenerateContentConfig ( tools = tools , response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute." , ], config : { tools : [{ urlContext : {}}, { googleSearch : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Give me three day events schedule based on YOUR_URL . Also let me know what needs to taken care of considering weather and commute."} ] } ], "tools": [ { "url_context": {} }, { "google_search": {} } ] }' > result.json cat result.json For more details about Grounding with Google Search, see the overview page. Contextual response The model's response will be based on the content it retrieved from the URLs. If the model retrieved content from URLs, the response will include url_context_metadata . Such a response might look something like the following (parts of the response have been omitted for brevity): { "candidates" : [ { "content" : { "parts" : [ { "text" : "... \n" } ], "role" : "model" }, ... 
"url_context_metadata" : { "url_metadata" : [ { "retrieved_url" : \ No newline at end of file diff --git a/docstore/d170d1a4-09ea-43c9-92c6-f5817db5dc99 b/docstore/d170d1a4-09ea-43c9-92c6-f5817db5dc99 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/d170d1a4-09ea-43c9-92c6-f5817db5dc99 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/d19a8c6a-2e3d-43f8-8ac1-b04b9ea1f002 b/docstore/d19a8c6a-2e3d-43f8-8ac1-b04b9ea1f002 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/d19a8c6a-2e3d-43f8-8ac1-b04b9ea1f002 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. 
Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/d1a353a8-afa2-48fb-ae8c-acca22814f03 b/docstore/d1a353a8-afa2-48fb-ae8c-acca22814f03 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/d1a353a8-afa2-48fb-ae8c-acca22814f03 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response .
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/d1a6ae17-4344-412a-9986-7f659c0ece8f b/docstore/d1a6ae17-4344-412a-9986-7f659c0ece8f new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/d1a6ae17-4344-412a-9986-7f659c0ece8f @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/d1ba207f-aab0-4c75-a7bb-a271d120020a b/docstore/d1ba207f-aab0-4c75-a7bb-a271d120020a new file mode 100644 index 0000000000000000000000000000000000000000..a6070457e038ed081cc12ae246b3742e9ebacf0a --- /dev/null +++ b/docstore/d1ba207f-aab0-4c75-a7bb-a271d120020a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d1bbb2c5-0fb6-4b72-9baf-74c01fd141ba b/docstore/d1bbb2c5-0fb6-4b72-9baf-74c01fd141ba new file mode 100644 index 0000000000000000000000000000000000000000..8c60a97b59d947e95247d6e4ee3eb21605ab2ae3 --- /dev/null +++ b/docstore/d1bbb2c5-0fb6-4b72-9baf-74c01fd141ba @@ -0,0 +1 @@ +open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . 
Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. VIDEO_PATH = /path/to/your/video.mp4 if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"video/mp4", "data": "' $( base64 $B64FLAGS $VIDEO_PATH ) '" } }, {"text": "Please summarize the video in 3 sentences."} ] }] }' 2 > /dev/null Include a YouTube URL Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change. The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content. Limitations: For the free tier, you can't upload more than 8 hours of \ No newline at end of file diff --git a/docstore/d1c050c1-31cf-4b37-b9e8-64c6069bd929 b/docstore/d1c050c1-31cf-4b37-b9e8-64c6069bd929 new file mode 100644 index 0000000000000000000000000000000000000000..48ebc0d450e476e2d2310fffefae223b737ab72c --- /dev/null +++ b/docstore/d1c050c1-31cf-4b37-b9e8-64c6069bd929 @@ -0,0 +1 @@ +Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling, specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. 
Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata \ No newline at end of file diff --git a/docstore/d1fbfe0b-275a-4526-b4d2-d3b03cf1731f b/docstore/d1fbfe0b-275a-4526-b4d2-d3b03cf1731f new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/d1fbfe0b-275a-4526-b4d2-d3b03cf1731f @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/d20bfb95-af64-49a8-b38f-0ab26fc50dbb b/docstore/d20bfb95-af64-49a8-b38f-0ab26fc50dbb new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/d20bfb95-af64-49a8-b38f-0ab26fc50dbb @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
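A minimal sketch of requesting the embeddings described above, assuming the google-genai Python SDK used in the other examples on these pages and its embed_content method (the sample strings are hypothetical):
from google import genai

client = genai.Client()

# Hypothetical inputs; the input token limit for this model is 2,048 (see the model details below).
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How does a text embedding work?"],
)

# Each returned embedding is a 768-dimensional vector (see the model details below).
for embedding in result.embeddings:
    print(len(embedding.values))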
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/d23d8445-6189-451b-bd41-9f1f23df0334 b/docstore/d23d8445-6189-451b-bd41-9f1f23df0334 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/d23d8445-6189-451b-bd41-9f1f23df0334 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. 
import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/d25722ea-4517-437b-abd9-28a5474e75cf b/docstore/d25722ea-4517-437b-abd9-28a5474e75cf new file mode 100644 index 0000000000000000000000000000000000000000..c60a398b68d2fb158c62411b9f70b1da071d4fb4 --- /dev/null +++ b/docstore/d25722ea-4517-437b-abd9-28a5474e75cf @@ -0,0 +1 @@ +response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. 
\ No newline at end of file diff --git a/docstore/d25e2105-ef07-423d-9ab4-abeb19e7759c b/docstore/d25e2105-ef07-423d-9ab4-abeb19e7759c new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/d25e2105-ef07-423d-9ab4-abeb19e7759c @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/d25e67f4-a4e7-4258-9e71-19774a4921e3 b/docstore/d25e67f4-a4e7-4258-9e71-19774a4921e3 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/d25e67f4-a4e7-4258-9e71-19774a4921e3 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/d27d5517-8d50-4596-9bd2-2c8ebb61f612 b/docstore/d27d5517-8d50-4596-9bd2-2c8ebb61f612 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/d27d5517-8d50-4596-9bd2-2c8ebb61f612 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. 
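As a rough sketch of what "works with the Live API" looks like in code, assuming the google-genai Python SDK and the client.aio.live.connect pattern shown in the transcription example above (the prompt text is hypothetical, and a text-only response modality is used for brevity):
import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send one text turn, then stream the model's text reply back.
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello over the Live API"}]},
            turn_complete=True,
        )
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

if __name__ == "__main__":
    asyncio.run(main())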
The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/d28820dc-f944-46b5-9ee8-fbaed8b1838d b/docstore/d28820dc-f944-46b5-9ee8-fbaed8b1838d new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/d28820dc-f944-46b5-9ee8-fbaed8b1838d @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/d2935a36-1ab7-4cd3-9efa-e1addc21a4c5 b/docstore/d2935a36-1ab7-4cd3-9efa-e1addc21a4c5 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/d2935a36-1ab7-4cd3-9efa-e1addc21a4c5 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. 
Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/d2c12c05-1d8a-4e6b-a249-14278dd29d80 b/docstore/d2c12c05-1d8a-4e6b-a249-14278dd29d80 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/d2c12c05-1d8a-4e6b-a249-14278dd29d80 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/d2cda560-ae2d-437c-9fcd-aec2f75a9717 b/docstore/d2cda560-ae2d-437c-9fcd-aec2f75a9717 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/d2cda560-ae2d-437c-9fcd-aec2f75a9717 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/d2d42222-42d2-4a9c-b108-8c71f0baa8a7 b/docstore/d2d42222-42d2-4a9c-b108-8c71f0baa8a7 new file mode 100644 index 0000000000000000000000000000000000000000..38b0d511edf99b6e46520aefe0a8854155d4053f --- /dev/null +++ b/docstore/d2d42222-42d2-4a9c-b108-8c71f0baa8a7 @@ -0,0 +1 @@ +energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. 
The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to \ No newline at end of file diff --git a/docstore/d2d44c61-7bde-4cb6-b8b9-af8c55c32f15 b/docstore/d2d44c61-7bde-4cb6-b8b9-af8c55c32f15 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/d2d44c61-7bde-4cb6-b8b9-af8c55c32f15 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
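Before the variant overview that follows: the note above says the 2.5 models think by default and that a thinking budget can be configured. A minimal sketch, assuming the google-genai Python SDK and its ThinkingConfig type (the prompt is hypothetical; a budget of 0 turns thinking off for 2.5 Flash):
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Summarize the rules of chess in two sentences.",  # hypothetical prompt
    config=types.GenerateContentConfig(
        # Omit thinking_config to keep the default "thinks as needed" behaviour;
        # thinking_budget=0 disables thinking for this model.
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    ),
)
print(response.text)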
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/d31dc1eb-8f26-4ec4-a069-20d43b60a741 b/docstore/d31dc1eb-8f26-4ec4-a069-20d43b60a741 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/d31dc1eb-8f26-4ec4-a069-20d43b60a741 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/d320cc1f-cb43-46a3-a55a-5b29ac3ef7ad b/docstore/d320cc1f-cb43-46a3-a55a-5b29ac3ef7ad new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/d320cc1f-cb43-46a3-a55a-5b29ac3ef7ad @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/d322787b-e8d2-4508-847a-0d4b6f841d1a b/docstore/d322787b-e8d2-4508-847a-0d4b6f841d1a new file mode 100644 index 0000000000000000000000000000000000000000..964387d039ef80cfbc57b203b39f125290e566d4 --- /dev/null +++ b/docstore/d322787b-e8d2-4508-847a-0d4b6f841d1a @@ -0,0 +1 @@ +using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong. What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. 
(gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content that can serve several purposes, depending on where you put the prefix: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes \ No newline at end of file diff --git a/docstore/d325ffa1-c2cd-40d5-a71d-c231c519bc6b b/docstore/d325ffa1-c2cd-40d5-a71d-c231c519bc6b new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/d325ffa1-c2cd-40d5-a71d-c231c519bc6b @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/d3392a92-0fc0-408b-9c8e-afc0c8fee391 b/docstore/d3392a92-0fc0-408b-9c8e-afc0c8fee391 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/d3392a92-0fc0-408b-9c8e-afc0c8fee391 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/d3620418-f3d1-4ec5-a4f8-e174a0d47813 b/docstore/d3620418-f3d1-4ec5-a4f8-e174a0d47813 new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/d3620418-f3d1-4ec5-a4f8-e174a0d47813 @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . 
generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/d38cb7d3-2c18-4cd3-9438-2f7d8bc89e3d b/docstore/d38cb7d3-2c18-4cd3-9438-2f7d8bc89e3d new file mode 100644 index 0000000000000000000000000000000000000000..eeaa745b8119787addf02809d9d1b660f835f8e5 --- /dev/null +++ b/docstore/d38cb7d3-2c18-4cd3-9438-2f7d8bc89e3d @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart Python from google import genai from google.genai import types # Define the function declaration for the model weather_function = { "name" : "get_current_temperature" , "description" : "Gets the current temperature for a given location." 
, "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city name, e.g. San Francisco" , }, }, "required" : [ "location" ], }, } # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ weather_function ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Send request with function declarations response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/d390ea83-32d5-49df-9b27-5350360da869 b/docstore/d390ea83-32d5-49df-9b27-5350360da869 new file mode 100644 index 0000000000000000000000000000000000000000..a3f65441166ab1d39d6f723c5ce4c04b54ac95e6 --- /dev/null +++ b/docstore/d390ea83-32d5-49df-9b27-5350360da869 @@ -0,0 +1 @@ +values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, break down complex tasks into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License \ No newline at end of file diff --git a/docstore/d3972ad9-5695-4728-b487-0ce222201e8b b/docstore/d3972ad9-5695-4728-b487-0ce222201e8b new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/d3972ad9-5695-4728-b487-0ce222201e8b @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. 
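The sampling parameters described here and just below (temperature, topK, topP, stop_sequences) can all be set on the request configuration. A minimal sketch, assuming the google-genai Python SDK's GenerateContentConfig (the prompt and parameter values are hypothetical):
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a one-line slogan for a coffee shop.",  # hypothetical prompt
    config=types.GenerateContentConfig(
        temperature=0.2,          # low temperature for a more deterministic response
        top_k=3,                  # sample from the 3 most probable tokens
        top_p=0.95,               # the default topP value noted in the text below
        stop_sequences=["\n\n"],  # stop generating at the first blank line
    ),
)
print(response.text)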
Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/d4327783-6af9-49e0-b0ba-35e37202e132 b/docstore/d4327783-6af9-49e0-b0ba-35e37202e132 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/d4327783-6af9-49e0-b0ba-35e37202e132 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. 
Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/d43c971e-b418-4a4d-bb11-c02dae096bcd b/docstore/d43c971e-b418-4a4d-bb11-c02dae096bcd new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/d43c971e-b418-4a4d-bb11-c02dae096bcd @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/d4441f0a-5f86-4ca0-b437-b221e3358bb7 b/docstore/d4441f0a-5f86-4ca0-b437-b221e3358bb7 new file mode 100644 index 0000000000000000000000000000000000000000..c1215478fcfc0a791e03023378a8264733510dd8 --- /dev/null +++ b/docstore/d4441f0a-5f86-4ca0-b437-b221e3358bb7 @@ -0,0 +1 @@ +connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . 
reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . \ No newline at end of file diff --git a/docstore/d44b9d54-8748-423c-8058-91851889db93 b/docstore/d44b9d54-8748-423c-8058-91851889db93 new file mode 100644 index 0000000000000000000000000000000000000000..5c08e584262d5f95532435a85f2ee4b9a401ebd8 --- /dev/null +++ b/docstore/d44b9d54-8748-423c-8058-91851889db93 @@ -0,0 +1 @@ +'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } What's next Now that you made your first API request, you might want to explore the following guides that show Gemini in action: Thinking Text generation Vision Long context Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/d44be512-0575-479c-9951-94e1fc48b4b5 b/docstore/d44be512-0575-479c-9951-94e1fc48b4b5 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/d44be512-0575-479c-9951-94e1fc48b4b5 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/d49900f2-609f-49d1-9710-c954d9bbec27 b/docstore/d49900f2-609f-49d1-9710-c954d9bbec27 new file mode 100644 index 0000000000000000000000000000000000000000..eb6db224edbdd160f04cb946308fd82587e98eec --- /dev/null +++ b/docstore/d49900f2-609f-49d1-9710-c954d9bbec27 @@ -0,0 +1 @@ +marks Spain's record-breaking fourth European Championship title.[5]((https:/...), [2](https:/...), [3](https:/...), [4](https:/...) Pricing When you use Grounding with Google Search, your project is billed per API request that includes the google_search tool. If the model decides to execute multiple search queries to answer a single prompt (for example, searching for "UEFA Euro 2024 winner" and "Spain vs England Euro 2024 final score" within the same API call), this counts as a single billable use of the tool for that request. For detailed pricing information, see the Gemini API pricing page . Supported Models Experimental and Preview models are not included. You can find their capabilities on the model overview page. Model Grounding with Google Search Gemini 2.5 Pro ✔️ Gemini 2.5 Flash ✔️ Gemini 2.0 Flash ✔️ Gemini 1.5 Pro ✔️ Gemini 1.5 Flash ✔️ Note: Older models use a google_search_retrieval tool. For all current models, use the google_search tool as shown in the examples. Grounding with Gemini 1.5 Models (Legacy) While the google_search tool is recommended for Gemini 2.0 and later, Gemini 1.5 support a legacy tool named google_search_retrieval . This tool provides a dynamic mode that allows the model to decide whether to perform a search based on its confidence that the prompt requires fresh information. If the model's confidence is above a dynamic_threshold you set (a value between 0.0 and 1.0), it will perform a search. Python # Note: This is a legacy approach for Gemini 1.5 models. # The 'google_search' tool is recommended for all new development. 
import os from google import genai from google.genai import types client = genai . Client () retrieval_tool = types . Tool ( google_search_retrieval = types . GoogleSearchRetrieval ( dynamic_retrieval_config = types . DynamicRetrievalConfig ( mode = types . DynamicRetrievalConfigMode . MODE_DYNAMIC , dynamic_threshold = 0.7 # Only search if confidence > 70% ) ) ) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/d49947fb-71e2-488c-bebc-1bd90c664ccb b/docstore/d49947fb-71e2-488c-bebc-1bd90c664ccb new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/d49947fb-71e2-488c-bebc-1bd90c664ccb @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/d4baf4e1-d016-4e1e-9f99-7b967de924d4 b/docstore/d4baf4e1-d016-4e1e-9f99-7b967de924d4 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/d4baf4e1-d016-4e1e-9f99-7b967de924d4 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/d4eb733f-6ede-4518-940c-bfc2327d5039 b/docstore/d4eb733f-6ede-4518-940c-bfc2327d5039 new file mode 100644 index 0000000000000000000000000000000000000000..c0f76dabad92f04a7ec66315fe09f497cfdb8cee --- /dev/null +++ b/docstore/d4eb733f-6ede-4518-940c-bfc2327d5039 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-generation Title: Image generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d4f5b2d7-2e46-498a-805c-1806548830a0 b/docstore/d4f5b2d7-2e46-498a-805c-1806548830a0 new file mode 100644 index 0000000000000000000000000000000000000000..65512c01f777d6e5e3b37a55514cc97b01015837 --- /dev/null +++ b/docstore/d4f5b2d7-2e46-498a-805c-1806548830a0 @@ -0,0 +1 @@ +Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . 
aio \ No newline at end of file diff --git a/docstore/d50ddbbc-33c4-409c-a7ce-1d2e8cf41fdf b/docstore/d50ddbbc-33c4-409c-a7ce-1d2e8cf41fdf new file mode 100644 index 0000000000000000000000000000000000000000..84fe47fad9b0a25663e2cf29b8b19192b98d3175 --- /dev/null +++ b/docstore/d50ddbbc-33c4-409c-a7ce-1d2e8cf41fdf @@ -0,0 +1 @@ +Available regions for Google AI Studio and Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Available regions for Google AI Studio and Gemini API If you reached this page after trying to open Google AI Studio , it may be because Google AI Studio is not available in your region, or you don't meet the age requirements (18+) for access. You can learn more about the available regions in the following section and other requirements in the terms of service . Available regions Note: For Colab users - Region restrictions are applied based on the region that the Colab instance is in, not the region that the user is in. You can check the location of the Colab instance using !curl ipinfo.io The Gemini API and Google AI Studio are available in the following countries and territories. If you're not in one of these countries or territories, try the Gemini API in Vertex AI : Albania Algeria American Samoa Angola Anguilla Antarctica Antigua and Barbuda Argentina Armenia Aruba Australia Austria Azerbaijan The Bahamas Bahrain Bangladesh Barbados Belgium Belize Benin Bermuda Bhutan Bolivia Bosnia Botswana Brazil British Indian Ocean Territory British Virgin Islands Brunei Bulgaria Burkina Faso Burundi Cabo Verde Cambodia Cameroon Canada Caribbean Netherlands Cayman Islands Central African Republic Chad Chile Christmas Island Cocos (Keeling) Islands Colombia Comoros Cook Islands Côte d'Ivoire Costa Rica Croatia Curaçao Czech Republic Democratic Republic of the Congo Denmark Djibouti Dominica Dominican Republic Ecuador Egypt El Salvador Equatorial Guinea Eritrea Estonia Eswatini Ethiopia Falkland Islands (Islas Malvinas) Faroe Islands Fiji Finland France \ No newline at end of file diff --git a/docstore/d50f05f7-b612-4d6d-a665-f76a79d22aeb b/docstore/d50f05f7-b612-4d6d-a665-f76a79d22aeb new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/d50f05f7-b612-4d6d-a665-f76a79d22aeb @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/d519483e-8990-41ac-847b-1302e90f5785 b/docstore/d519483e-8990-41ac-847b-1302e90f5785 new file mode 100644 index 0000000000000000000000000000000000000000..433635003046509e85b7917fbaa1cad75744aec9 --- /dev/null +++ b/docstore/d519483e-8990-41ac-847b-1302e90f5785 @@ -0,0 +1 @@ +GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' 
}]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines \ No newline at end of file diff --git a/docstore/d5318fd4-79b7-4def-9651-38c81124a358 b/docstore/d5318fd4-79b7-4def-9651-38c81124a358 new file mode 100644 index 0000000000000000000000000000000000000000..6adba52776c85866a9f7e3e3060c0512ed2f447c --- /dev/null +++ b/docstore/d5318fd4-79b7-4def-9651-38c81124a358 @@ -0,0 +1 @@ +about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/docstore/d53fe004-7daa-4bf5-b419-42fdad5d8591 b/docstore/d53fe004-7daa-4bf5-b419-42fdad5d8591 new file mode 100644 index 0000000000000000000000000000000000000000..5a67c041917cdaf904b0e03794a07af474503a9a --- /dev/null +++ b/docstore/d53fe004-7daa-4bf5-b419-42fdad5d8591 @@ -0,0 +1 @@ +upload..." curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. 
Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = \ No newline at end of file diff --git a/docstore/d55ea83c-e8db-45f4-9e31-a00e14cd7503 b/docstore/d55ea83c-e8db-45f4-9e31-a00e14cd7503 new file mode 100644 index 0000000000000000000000000000000000000000..17bfa7cb7ce514bf0ade86c26f1bff30fbb20a2e --- /dev/null +++ b/docstore/d55ea83c-e8db-45f4-9e31-a00e14cd7503 @@ -0,0 +1 @@ +model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ \ No newline at end of file diff --git a/docstore/d57cb834-61f5-4884-b27b-113767ffeb44 b/docstore/d57cb834-61f5-4884-b27b-113767ffeb44 new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ b/docstore/d57cb834-61f5-4884-b27b-113767ffeb44 @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. 
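The sections that follow walk through each step of this workflow in detail. As a quick orientation first, here is a minimal end-to-end sketch using the Python SDK: it creates a small inline batch job, then polls it until it reaches a terminal state. The polling call, the job-state names, and the sleep interval are illustrative assumptions based on the SDK patterns shown later in this guide, not verbatim API documentation.

```python
import time
from google import genai

client = genai.Client()

# Create a batch job from two inline requests (see "Creating a batch job" below).
job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=[
        {"contents": [{"parts": [{"text": "Summarize the water cycle."}], "role": "user"}]},
        {"contents": [{"parts": [{"text": "Name three prime numbers."}], "role": "user"}]},
    ],
    config={"display_name": "quickstart-batch-job"},
)
print(f"Created batch job: {job.name}")

# Poll until the job leaves its in-progress states. The state names here are
# assumptions about the job lifecycle; check the Batch Mode reference for the
# authoritative list.
while True:
    job = client.batches.get(name=job.name)
    if job.state.name not in ("JOB_STATE_PENDING", "JOB_STATE_RUNNING"):
        break
    time.sleep(60)  # batch work is non-urgent, so infrequent polling is fine

print(f"Job {job.name} finished with state: {job.state.name}")
```

Because the turnaround target is 24 hours, a long polling interval (a minute or more) is usually enough; how you read the results depends on whether you used inline requests or an input file, as described below.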
Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/d57d2db9-66e4-4f6c-811f-52d0de5ec582 b/docstore/d57d2db9-66e4-4f6c-811f-52d0de5ec582 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/d57d2db9-66e4-4f6c-811f-52d0de5ec582 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call .
name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/d5881606-cbec-40bb-9397-dc1eaa00684f b/docstore/d5881606-cbec-40bb-9397-dc1eaa00684f new file mode 100644 index 0000000000000000000000000000000000000000..c35e4b7a988a5975b73d58ef99727df7844cf341 --- /dev/null +++ b/docstore/d5881606-cbec-40bb-9397-dc1eaa00684f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies#under-the-hood Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d5b7f332-733e-4f0b-901a-ac5d8cc81330 b/docstore/d5b7f332-733e-4f0b-901a-ac5d8cc81330 new file mode 100644 index 0000000000000000000000000000000000000000..4403c8e8ebca16251f4875b8e14907f4412efbd1 --- /dev/null +++ b/docstore/d5b7f332-733e-4f0b-901a-ac5d8cc81330 @@ -0,0 +1 @@ +"role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . 
create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import \ No newline at end of file diff --git a/docstore/d5bc1f2a-80b1-4731-b8d8-74bec57cfd49 b/docstore/d5bc1f2a-80b1-4731-b8d8-74bec57cfd49 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/d5bc1f2a-80b1-4731-b8d8-74bec57cfd49 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/d5c1cb07-72f9-4a1f-9680-b93cdbdd71a3 b/docstore/d5c1cb07-72f9-4a1f-9680-b93cdbdd71a3 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/d5c1cb07-72f9-4a1f-9680-b93cdbdd71a3 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. 
The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/d5e75323-23d4-4e0b-ad34-5bffd1424713 b/docstore/d5e75323-23d4-4e0b-ad34-5bffd1424713 new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/d5e75323-23d4-4e0b-ad34-5bffd1424713 @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. 
Response token \ No newline at end of file diff --git a/docstore/d5f7732f-430b-43fd-a06e-c80fdfa0159d b/docstore/d5f7732f-430b-43fd-a06e-c80fdfa0159d new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/d5f7732f-430b-43fd-a06e-c80fdfa0159d @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/d6102ff8-0bfe-4155-b509-6e31fb941da8 b/docstore/d6102ff8-0bfe-4155-b509-6e31fb941da8 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/d6102ff8-0bfe-4155-b509-6e31fb941da8 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/d6104dd8-8507-4b65-a2be-79b4b82dac83 b/docstore/d6104dd8-8507-4b65-a2be-79b4b82dac83 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/d6104dd8-8507-4b65-a2be-79b4b82dac83 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/d61f34b9-adad-4712-aa2e-085b1dea1479 b/docstore/d61f34b9-adad-4712-aa2e-085b1dea1479 new file mode 100644 index 0000000000000000000000000000000000000000..40517314fd91c121847408df8a1f7fc600adf0b3 --- /dev/null +++ b/docstore/d61f34b9-adad-4712-aa2e-085b1dea1479 @@ -0,0 +1 @@ +string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property \ No newline at end of file diff --git a/docstore/d6253408-3003-48bd-a54f-7ac16f975514 b/docstore/d6253408-3003-48bd-a54f-7ac16f975514 new file mode 100644 index 0000000000000000000000000000000000000000..08922eb1e5da83e7a67a2a4aeaf4437890d1333a --- /dev/null +++ b/docstore/d6253408-3003-48bd-a54f-7ac16f975514 @@ -0,0 +1 @@ +trademark of Oracle and/or its affiliates. Last updated 2025-05-31 UTC. \ No newline at end of file diff --git a/docstore/d628a2db-7e6e-4918-8f3d-1b2d441dda6a b/docstore/d628a2db-7e6e-4918-8f3d-1b2d441dda6a new file mode 100644 index 0000000000000000000000000000000000000000..485847fd8e226bc46bd8d42c44cd3e8dd100fb7e --- /dev/null +++ b/docstore/d628a2db-7e6e-4918-8f3d-1b2d441dda6a @@ -0,0 +1 @@ +supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. 
If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request. Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner. What's next Check out the batch mode notebook for more examples. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/d64998a8-a32a-4af5-b37a-681536e0d2dd b/docstore/d64998a8-a32a-4af5-b37a-681536e0d2dd new file mode 100644 index 0000000000000000000000000000000000000000..276b272d06bde6f464a4287876b38d8d2bc17eb7 --- /dev/null +++ b/docstore/d64998a8-a32a-4af5-b37a-681536e0d2dd @@ -0,0 +1 @@ +Migrate to the Google GenAI SDK | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Migrate to the Google GenAI SDK Starting with the Gemini 2.0 release in late 2024, we introduced a new set of libraries called the Google GenAI SDK . It offers an improved developer experience through an updated client architecture , and simplifies the transition between developer and enterprise workflows. The Google GenAI SDK is now in General Availability (GA) across all supported platforms. If you're using one of our legacy libraries , we strongly recommend that you migrate. This guide provides before-and-after examples of migrated code to help you get started. Note: The Go examples omit imports and other boilerplate code to improve readability. Installation Before Python pip install -U -q "google-generativeai" JavaScript npm install @google/generative-ai Go go get github.com/google/generative-ai-go After Python pip install -U -q "google-genai" JavaScript npm install @google/genai Go go get google.golang.org/genai API access The old SDK implicitly handled the API client behind the scenes using a variety of ad hoc methods. This made it hard to manage the client and credentials. Now, you interact through a central Client object. This Client object acts as a single entry point for various API services (e.g., models , chats , files , tunings ), promoting consistency and simplifying credential and configuration management across different API calls.
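As a quick illustration of that Client-centric pattern before the detailed before-and-after comparisons below, here is a minimal sketch using the new google-genai Python package. The prompt text is made up for illustration, and the sketch assumes the client picks up its API key from the GEMINI_API_KEY environment variable.

```python
from google import genai

# One central Client: credentials and configuration are managed in one place.
client = genai.Client()

# Every API service hangs off the same object: client.models, client.chats,
# client.files, client.tunings, and so on.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="In one sentence, what does a single client entry point simplify?",
)
print(response.text)
```

The same client instance is reused for every call, so changing credentials or configuration means updating one object rather than touching each model wrapper you construct.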
Before (Less Centralized API Access) Python The old SDK didn't explicitly use a top-level client object for most API calls. You would directly instantiate and interact with GenerativeModel objects. import \ No newline at end of file diff --git a/docstore/d649de56-3684-4d54-b749-f028dfc5a660 b/docstore/d649de56-3684-4d54-b749-f028dfc5a660 new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/d649de56-3684-4d54-b749-f028dfc5a660 @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/d64bd318-8874-4b29-bcdd-90e23bd58b16 b/docstore/d64bd318-8874-4b29-bcdd-90e23bd58b16 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/d64bd318-8874-4b29-bcdd-90e23bd58b16 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/d659dae6-b38a-4efe-bdd1-a10607a6039a b/docstore/d659dae6-b38a-4efe-bdd1-a10607a6039a new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/d659dae6-b38a-4efe-bdd1-a10607a6039a @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. 
This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/d65d40a0-5510-4910-b283-5565ebc29d61 b/docstore/d65d40a0-5510-4910-b283-5565ebc29d61 new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/d65d40a0-5510-4910-b283-5565ebc29d61 @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurants menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. 
In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/d66ebddc-ee88-4e6c-b544-5b183e750bc5 b/docstore/d66ebddc-ee88-4e6c-b544-5b183e750bc5 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/d66ebddc-ee88-4e6c-b544-5b183e750bc5 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . 
ToolConfig ( \ No newline at end of file diff --git a/docstore/d66f10c4-8a6b-4ee2-bae6-956f31d09e7c b/docstore/d66f10c4-8a6b-4ee2-bae6-956f31d09e7c new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/d66f10c4-8a6b-4ee2-bae6-956f31d09e7c @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/d6898b38-6856-4921-8603-4a34464f5cb0 b/docstore/d6898b38-6856-4921-8603-4a34464f5cb0 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/d6898b38-6856-4921-8603-4a34464f5cb0 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/d68ad469-8e03-4ce4-9289-8e1137e9232d b/docstore/d68ad469-8e03-4ce4-9289-8e1137e9232d new file mode 100644 index 0000000000000000000000000000000000000000..fcdd74a2066e8337e87bd4f6deeda8c36aff5869 --- /dev/null +++ b/docstore/d68ad469-8e03-4ce4-9289-8e1137e9232d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d68c4c7b-b4c5-4ca9-83f4-e0b7663f288d b/docstore/d68c4c7b-b4c5-4ca9-83f4-e0b7663f288d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/d68c4c7b-b4c5-4ca9-83f4-e0b7663f288d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. 
Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/d68da206-59d9-4af4-af06-2f33cbf614a1 b/docstore/d68da206-59d9-4af4-af06-2f33cbf614a1 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/d68da206-59d9-4af4-af06-2f33cbf614a1 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/d6be2c3a-696e-46d5-96ed-c3152a216021 b/docstore/d6be2c3a-696e-46d5-96ed-c3152a216021 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/d6be2c3a-696e-46d5-96ed-c3152a216021 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
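To make "measuring the relatedness of strings" concrete, here is a small, hedged sketch that embeds two strings with text-embedding-004 and compares them with cosine similarity; the exact embed_content call shape and the .values field are assumptions based on the google-genai SDK's general style, not details stated on this page.
```
import numpy as np
from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Embed two strings with the embedding model described below.
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How do I bake bread?", "Give me a recipe for a loaf of bread."],
)
vec_a, vec_b = (np.array(e.values) for e in result.embeddings)

# Cosine similarity: values closer to 1.0 mean the strings are more related.
similarity = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print(f"Cosine similarity: {similarity:.3f}")
```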
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/d6c4822d-6a7f-43ec-acc4-87fac78a69d2 b/docstore/d6c4822d-6a7f-43ec-acc4-87fac78a69d2 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/d6c4822d-6a7f-43ec-acc4-87fac78a69d2 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/d6dd9862-6f07-4af8-9f9f-03875aefeb92 b/docstore/d6dd9862-6f07-4af8-9f9f-03875aefeb92 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/d6dd9862-6f07-4af8-9f9f-03875aefeb92 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call .
name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/d6de9ec4-c43f-401b-b09c-aab1c1d76e07 b/docstore/d6de9ec4-c43f-401b-b09c-aab1c1d76e07 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/d6de9ec4-c43f-401b-b09c-aab1c1d76e07 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/d6e11b7c-03f1-4366-9520-3372116ab03f b/docstore/d6e11b7c-03f1-4366-9520-3372116ab03f new file mode 100644 index 0000000000000000000000000000000000000000..b69246ffb0dbd000257d7c3766a3c0948ac629b8 --- /dev/null +++ b/docstore/d6e11b7c-03f1-4366-9520-3372116ab03f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/semantic_retrieval#main-content Title: Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d701ac04-3f1b-415c-9651-19f53b1cfd02 b/docstore/d701ac04-3f1b-415c-9651-19f53b1cfd02 new file mode 100644 index 0000000000000000000000000000000000000000..5df22ab370b3c2108c2a4e677731cc4af2835ff9 --- /dev/null +++ b/docstore/d701ac04-3f1b-415c-9651-19f53b1cfd02 @@ -0,0 +1 @@ +variables, if you don't pass one to the client. export GEMINI_API_KEY = "YOUR_API_KEY" from google import genai client = genai . Client () # Set the API key using the GEMINI_API_KEY env var. 
# Alternatively, you could set the API key explicitly: # client = genai.Client(api_key="your_api_key") JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); Go Import the GenAI library: import "google.golang.org/genai" Create the client: client , err := genai . NewClient ( ctx , & genai . ClientConfig { Backend : genai . BackendGeminiAPI , }) Generate content Text Before Python Previously, there were no client objects, you accessed APIs directly through GenerativeModel objects. import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'Tell me a story in 300 words' ) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Tell me a story in 300 words" ; const result = await model . generateContent ( prompt ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me a story in 300 words." )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response parts After Python The new Google GenAI SDK provides access to all the API methods through the Client object. Except for a few stateful special cases ( chat and live-api session s), these are all stateless functions. For utility and uniformity, objects returned are pydantic classes. \ No newline at end of file diff --git a/docstore/d7072d8c-7b1d-43fe-a87e-1fa0834ec0d2 b/docstore/d7072d8c-7b1d-43fe-a87e-1fa0834ec0d2 new file mode 100644 index 0000000000000000000000000000000000000000..0c68486ff0914a4c906e879b3931f6af31f32426 --- /dev/null +++ b/docstore/d7072d8c-7b1d-43fe-a87e-1fa0834ec0d2 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/sdks#install Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d7078aef-7955-46cd-aa5f-39541c29a298 b/docstore/d7078aef-7955-46cd-aa5f-39541c29a298 new file mode 100644 index 0000000000000000000000000000000000000000..6ba79c2c7031e707d89e2b23470a5d9ee375cc5c --- /dev/null +++ b/docstore/d7078aef-7955-46cd-aa5f-39541c29a298 @@ -0,0 +1 @@ +mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . 
GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" \ No newline at end of file diff --git a/docstore/d7206141-5c48-4be4-a648-41dbd55f2fae b/docstore/d7206141-5c48-4be4-a648-41dbd55f2fae new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/d7206141-5c48-4be4-a648-41dbd55f2fae @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/d72d65a6-323f-49db-9aea-9ed7bd32fd7a b/docstore/d72d65a6-323f-49db-9aea-9ed7bd32fd7a new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/d72d65a6-323f-49db-9aea-9ed7bd32fd7a @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/d766f388-2aa8-4e9b-9f21-fe670ae903f4 b/docstore/d766f388-2aa8-4e9b-9f21-fe670ae903f4 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/d766f388-2aa8-4e9b-9f21-fe670ae903f4 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. 
This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/d769290e-7c88-49f0-ae18-b3a2dcffd3fa b/docstore/d769290e-7c88-49f0-ae18-b3a2dcffd3fa new file mode 100644 index 0000000000000000000000000000000000000000..29805a750d326aab08740367fc13678c1846ec09 --- /dev/null +++ b/docstore/d769290e-7c88-49f0-ae18-b3a2dcffd3fa @@ -0,0 +1 @@ +default. Here, you disable it. from google import genai from google.genai import types client = genai . Client () def get_current_weather ( location : str ) - > str : """Get the current whether in a given location. Args: location: required, The city and state, e.g. San Franciso, CA unit: celsius or fahrenheit """ print ( f 'Called with: { location =} ' ) return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ], automatic_function_calling = { 'disable' : True }, ), ) function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call Automatic function calling Before Python The old SDK only supports automatic function calling in chat. In the new SDK this is the default behavior in generate_content . import google.generativeai as genai def get_current_weather ( city : str ) - > str : return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) chat = model . start_chat ( enable_automatic_function_calling = True ) result = chat . send_message ( "What is the weather in San Francisco?" ) After Python from google import genai from google.genai import types client = genai . Client () def get_current_weather ( city : str ) - > str : return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ] ), ) Code execution Code execution is a tool that allows the model to generate Python code, run it, and return the result. 
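For comparison with the Before example that follows, here is a minimal, hedged sketch of enabling code execution with the new google-genai SDK; the types.Tool / ToolCodeExecution wiring and the response part fields are assumptions based on the SDK's configuration style rather than details stated on this page.
```
from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Enable the code execution tool so the model can write and run Python itself.
response = client.models.generate_content(
    model="gemini-2.0-flash",  # placeholder model name
    contents=(
        "What is the sum of the first 50 prime numbers? "
        "Generate and run code for the calculation."
    ),
    config=types.GenerateContentConfig(
        tools=[types.Tool(code_execution=types.ToolCodeExecution())],
    ),
)

# The response can interleave text, generated code, and execution results.
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    if part.executable_code:
        print(part.executable_code.code)
    if part.code_execution_result:
        print(part.code_execution_result.output)
```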
Before Python import google.generativeai as genai model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = "code_execution" ) result = model . generate_content ( "What is the sum of the first 50 prime numbers? Generate and run code for " "the calculation, and make sure you \ No newline at end of file diff --git a/docstore/d76ed4c7-f746-460e-846e-69d8687d8585 b/docstore/d76ed4c7-f746-460e-846e-69d8687d8585 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/d76ed4c7-f746-460e-846e-69d8687d8585 @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/d78bf36b-ae62-4bd2-9469-9dda03cb2c7d b/docstore/d78bf36b-ae62-4bd2-9469-9dda03cb2c7d new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/d78bf36b-ae62-4bd2-9469-9dda03cb2c7d @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/d7bcc14c-8edb-4204-ab0e-93cf32fb1db6 b/docstore/d7bcc14c-8edb-4204-ab0e-93cf32fb1db6 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/d7bcc14c-8edb-4204-ab0e-93cf32fb1db6 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. 
Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/d7c153f2-d61e-4edd-b293-24d8b8ba380e b/docstore/d7c153f2-d61e-4edd-b293-24d8b8ba380e new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/d7c153f2-d61e-4edd-b293-24d8b8ba380e @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/d7cdd265-6abc-41f1-989f-feeeed531195 b/docstore/d7cdd265-6abc-41f1-989f-feeeed531195 new file mode 100644 index 0000000000000000000000000000000000000000..045707d455060dfd20be0644c14272aa57ff277b --- /dev/null +++ b/docstore/d7cdd265-6abc-41f1-989f-feeeed531195 @@ -0,0 +1 @@ +"log" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam\' s Razor and provide a simple, everyday example. " } ] } ] }' ``` Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks . If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are thinkingBudget configuration details for each model type. Model Default setting (Thinking budget is not set) Range Disable thinking Turn on dynamic thinking 2.5 Pro Dynamic thinking: Model decides when and how much to think 128 to 32768 N/A: Cannot disable thinking thinkingBudget = -1 2.5 Flash Dynamic thinking: Model decides when and how much to think 0 to 24576 thinkingBudget = 0 thinkingBudget = -1 2.5 Flash Lite Model does not think 512 to 24576 thinkingBudget = 0 thinkingBudget = -1 Python from google import genai from google.genai import types client = genai . \ No newline at end of file diff --git a/docstore/d7d1a968-d64a-41e0-8689-f2e352f89934 b/docstore/d7d1a968-d64a-41e0-8689-f2e352f89934 new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/d7d1a968-d64a-41e0-8689-f2e352f89934 @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . 
get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/d7de47ef-cc7c-4c89-9d34-9147b8a5bec6 b/docstore/d7de47ef-cc7c-4c89-9d34-9147b8a5bec6 new file mode 100644 index 0000000000000000000000000000000000000000..73b4d10fa85a5cdaed2f1e90e6509f07166441b9 --- /dev/null +++ b/docstore/d7de47ef-cc7c-4c89-9d34-9147b8a5bec6 @@ -0,0 +1 @@ +https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' "Thinking" is on by default on many of our code samples Many code samples on this site use the Gemini 2.5 Flash model, which has the "thinking" feature enabled by default to enhance response quality. You should be aware that this may increase response time and token usage. If you prioritize speed or wish to minimize costs, you can disable this feature by setting the thinking budget to zero, as shown in the examples below. For more details, see the thinking guide . Note: Thinking is only available on Gemini 2.5 series models and can't be disabled on Gemini 2.5 Pro. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) \ No newline at end of file diff --git a/docstore/d8111e03-9141-4be2-a3ed-8c9c0b62b69f b/docstore/d8111e03-9141-4be2-a3ed-8c9c0b62b69f new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/d8111e03-9141-4be2-a3ed-8c9c0b62b69f @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . 
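As a complement to the thinking_budget = 0 example above, the following hedged sketch shows the dynamic-thinking setting described earlier, where thinking_budget = -1 lets the model adjust its budget to the complexity of the request; the model name is a placeholder.
```
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",  # placeholder; thinking budgets apply to 2.5-series models
    contents="Explain the concept of Occam's Razor and provide a simple, everyday example.",
    config=types.GenerateContentConfig(
        # -1 turns on dynamic thinking: the model decides how much to think
        # based on the complexity of the request.
        thinking_config=types.ThinkingConfig(thinking_budget=-1)
    ),
)
print(response.text)
```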
name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/d81d708e-c296-4dde-bfa6-27f60ab664cc b/docstore/d81d708e-c296-4dde-bfa6-27f60ab664cc new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/d81d708e-c296-4dde-bfa6-27f60ab664cc @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . 
name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/d822c2f1-4dfb-4a62-b3ef-92b7a7003325 b/docstore/d822c2f1-4dfb-4a62-b3ef-92b7a7003325 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/d822c2f1-4dfb-4a62-b3ef-92b7a7003325 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/d857790e-ba04-4d65-a1a3-7721f5af2fc0 b/docstore/d857790e-ba04-4d65-a1a3-7721f5af2fc0 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/d857790e-ba04-4d65-a1a3-7721f5af2fc0 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro 
Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/d866add2-06a4-48b4-b195-1748e9757e39 b/docstore/d866add2-06a4-48b4-b195-1748e9757e39 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/d866add2-06a4-48b4-b195-1748e9757e39 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . 
split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/d86832b2-ba9b-41cc-adbc-1a74102081af b/docstore/d86832b2-ba9b-41cc-adbc-1a74102081af new file mode 100644 index 0000000000000000000000000000000000000000..837870a459bf3af151e2e94da3e76d8940df1169 --- /dev/null +++ b/docstore/d86832b2-ba9b-41cc-adbc-1a74102081af @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session#context-window-compression Title: Session management with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/d86a5fcd-a97d-45ff-8212-6c2639ad6aa0 b/docstore/d86a5fcd-a97d-45ff-8212-6c2639ad6aa0 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/d86a5fcd-a97d-45ff-8212-6c2639ad6aa0 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/d86bd346-670a-4ee9-9237-3100db6c971d b/docstore/d86bd346-670a-4ee9-9237-3100db6c971d new file mode 100644 index 0000000000000000000000000000000000000000..4403c8e8ebca16251f4875b8e14907f4412efbd1 --- /dev/null +++ b/docstore/d86bd346-670a-4ee9-9237-3100db6c971d @@ -0,0 +1 @@ +"role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream = True ) for chunk in response : print ( chunk . choices [ 0 ] . delta ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const completion = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : [ { "role" : "system" , "content" : "You are a helpful assistant." }, { "role" : "user" , "content" : "Hello!" } ], stream : true , }); for await ( const chunk of completion ) { console . log ( chunk . choices [ 0 ]. delta . content ); } } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ {"role": "user", "content": "Explain to me how AI works"} ], "stream": true }' Function calling Function calling makes it easier for you to get structured data outputs from generative models and is supported in the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ] messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }] response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = messages , tools = tools , tool_choice = "auto" ) print ( response ) JavaScript import \ No newline at end of file diff --git a/docstore/d87ff908-baa1-44b0-9d56-a311c28f2239 b/docstore/d87ff908-baa1-44b0-9d56-a311c28f2239 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/d87ff908-baa1-44b0-9d56-a311c28f2239 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . 
UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/d88e0653-68ca-4e9f-89bb-f4222463bf94 b/docstore/d88e0653-68ca-4e9f-89bb-f4222463bf94 new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/d88e0653-68ca-4e9f-89bb-f4222463bf94 @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/d8ac3a04-c02d-4dcc-9b67-77cca1a3c602 b/docstore/d8ac3a04-c02d-4dcc-9b67-77cca1a3c602 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/d8ac3a04-c02d-4dcc-9b67-77cca1a3c602 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". 
For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/d8b70ff9-fd50-4400-af0c-80168d0d03fc b/docstore/d8b70ff9-fd50-4400-af0c-80168d0d03fc new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/d8b70ff9-fd50-4400-af0c-80168d0d03fc @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . 
TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . \ No newline at end of file diff --git a/docstore/d8c42ce9-4253-4874-a9a0-fc31fc670ff2 b/docstore/d8c42ce9-4253-4874-a9a0-fc31fc670ff2 new file mode 100644 index 0000000000000000000000000000000000000000..c25aa08b43110ad03ba57bdce48865495fdfb941 --- /dev/null +++ b/docstore/d8c42ce9-4253-4874-a9a0-fc31fc670ff2 @@ -0,0 +1 @@ +fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . 
text ); } await main (); \ No newline at end of file diff --git a/docstore/d8c95690-a485-49e8-832f-3561140dfbf4 b/docstore/d8c95690-a485-49e8-832f-3561140dfbf4 new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/d8c95690-a485-49e8-832f-3561140dfbf4 @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a \ No newline at end of file diff --git a/docstore/d8cba102-3e05-4181-84ae-d8ef428b543c b/docstore/d8cba102-3e05-4181-84ae-d8ef428b543c new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/d8cba102-3e05-4181-84ae-d8ef428b543c @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
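The single-speaker text-to-speech setup described in the speech-generation fragment above (response modality "audio", a SpeechConfig carrying a VoiceConfig) can be sketched as follows. The SpeechConfig/VoiceConfig/PrebuiltVoiceConfig types and the "AUDIO" response modality are taken from the multi-speaker example later in these fragments; the single-speaker voice_config field and the way the audio bytes are read off the response are assumptions.

```python
# Minimal single-speaker TTS sketch, assuming the config types shown in the
# multi-speaker example elsewhere in these fragments.
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(  # single-speaker variant (assumed)
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name="Kore"  # one of the prebuilt voices listed later
                )
            )
        ),
    ),
)

# Assumption: raw audio is returned as inline data on the first candidate part.
audio_bytes = response.candidates[0].content.parts[0].inline_data.data
with open("out.pcm", "wb") as f:
    f.write(audio_bytes)
```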
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/d8e8d597-7de9-43d9-9942-269ad2fef21f b/docstore/d8e8d597-7de9-43d9-9942-269ad2fef21f new file mode 100644 index 0000000000000000000000000000000000000000..4a588ba82271aaa08f2df12295049c23bd57f7eb --- /dev/null +++ b/docstore/d8e8d597-7de9-43d9-9942-269ad2fef21f @@ -0,0 +1 @@ +, ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary-object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , AllowedType ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools combining native tools with function calling at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a- Live API only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity. Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! 
""" tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. \ No newline at end of file diff --git a/docstore/d8f6764d-19e0-4d3a-8ff3-6eb6494c3a5c b/docstore/d8f6764d-19e0-4d3a-8ff3-6eb6494c3a5c new file mode 100644 index 0000000000000000000000000000000000000000..46b1ab716068a90ca8b9aaaffe42e5334bcea2c0 --- /dev/null +++ b/docstore/d8f6764d-19e0-4d3a-8ff3-6eb6494c3a5c @@ -0,0 +1 @@ +Batch Mode | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a \ No newline at end of file diff --git a/docstore/d8fc45b6-286e-4bf7-8f11-b82906a922e6 b/docstore/d8fc45b6-286e-4bf7-8f11-b82906a922e6 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/d8fc45b6-286e-4bf7-8f11-b82906a922e6 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/d91ff65d-fb01-4559-bcba-162bb92966d9 b/docstore/d91ff65d-fb01-4559-bcba-162bb92966d9 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/d91ff65d-fb01-4559-bcba-162bb92966d9 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . 
get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/d966a117-862f-4a2c-98b9-f782950a3666 b/docstore/d966a117-862f-4a2c-98b9-f782950a3666 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/d966a117-862f-4a2c-98b9-f782950a3666 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/d96f9aa5-4cfd-470b-8cae-b4998550fe48 b/docstore/d96f9aa5-4cfd-470b-8cae-b4998550fe48 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/d96f9aa5-4cfd-470b-8cae-b4998550fe48 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/d979376d-5761-4efc-905d-d0e9eda4556d b/docstore/d979376d-5761-4efc-905d-d0e9eda4556d new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/d979376d-5761-4efc-905d-d0e9eda4556d @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. 
Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/d97b0eb2-73e5-4984-9d6d-46ac642c0b86 b/docstore/d97b0eb2-73e5-4984-9d6d-46ac642c0b86 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/d97b0eb2-73e5-4984-9d6d-46ac642c0b86 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/d97e264f-1383-4f29-a362-5ee5236ad124 b/docstore/d97e264f-1383-4f29-a362-5ee5236ad124 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/d97e264f-1383-4f29-a362-5ee5236ad124 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. 
Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/d9ac7730-e3c5-4b68-81dd-7223828e6561 b/docstore/d9ac7730-e3c5-4b68-81dd-7223828e6561 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/d9ac7730-e3c5-4b68-81dd-7223828e6561 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/d9b9d3a5-086f-469a-ba5b-04ac23e351b4 b/docstore/d9b9d3a5-086f-469a-ba5b-04ac23e351b4 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/d9b9d3a5-086f-469a-ba5b-04ac23e351b4 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
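The declare/call/execute loop described in the function-calling fragment above can be shortened in the Python SDK with automatic function calling, which another fragment here uses (tools=[get_current_temperature], with the SDK handling the call and returning the final text). The sketch below follows that pattern; the function body is a hypothetical stub rather than a real weather lookup.

```python
# Sketch: automatic function calling - pass a typed Python function directly
# as a tool and let the SDK execute the call, as shown in the fragment that
# configures tools=[get_current_temperature].
from google import genai
from google.genai import types

client = genai.Client()


def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location."""
    return {"location": location, "temperature_c": 21}  # illustrative stub


response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in London right now?",
    config=types.GenerateContentConfig(tools=[get_current_temperature]),
)

# The SDK handles the function call and returns the final text.
print(response.text)
```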
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/d9cc4448-c149-47f1-ab2f-8d3036c7cdda b/docstore/d9cc4448-c149-47f1-ab2f-8d3036c7cdda new file mode 100644 index 0000000000000000000000000000000000000000..9a3ae8e54d036eb9d08cf51953b4e3479c03ffae --- /dev/null +++ b/docstore/d9cc4448-c149-47f1-ab2f-8d3036c7cdda @@ -0,0 +1 @@ +Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/da03155c-596c-4763-b187-2e7456b0f849 b/docstore/da03155c-596c-4763-b187-2e7456b0f849 new file mode 100644 index 0000000000000000000000000000000000000000..ec6cba9f5d0ceb3b74c56797939372d30da827c9 --- /dev/null +++ b/docstore/da03155c-596c-4763-b187-2e7456b0f849 @@ -0,0 +1 @@ += "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . 
PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam." , }) const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm \ No newline at end of file diff --git a/docstore/da14d0e5-de4d-47bf-acc5-3d7866563d9c b/docstore/da14d0e5-de4d-47bf-acc5-3d7866563d9c new file mode 100644 index 0000000000000000000000000000000000000000..4d2e24ea9938e140050ecec90b7146205f15036b --- /dev/null +++ b/docstore/da14d0e5-de4d-47bf-acc5-3d7866563d9c @@ -0,0 +1 @@ +Embeddings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Embeddings Note: Introducing our first Gemini embedding model, available now to developers as gemini-embedding-exp-03-07 in the API. The Gemini API supports several embedding models that generate embeddings for words, phrases, code, and sentences. The resulting embeddings can then be used for tasks such as semantic search, text classification, and clustering, among many others. What are embeddings? Embeddings are numerical representations of text (or other media formats) that capture relationships between inputs. Text embeddings work by converting text into arrays of floating point numbers, called vectors . These vectors are designed to capture the meaning of the text. The length of the embedding array is called the vector's dimensionality . A passage of text might be represented by a vector containing hundreds of dimensions. Embeddings capture semantic meaning and context, which results in text with similar meanings having "closer" embeddings. For example, the sentence "I took my dog to the vet" and "I took my cat to the vet" would have embeddings that are close to each other in the vector space. You can use embeddings to compare different texts and understand how they relate. For example, if the embeddings of the text "cat" and "dog" are close together you can infer that these words are similar in meaning, context, or both. This enables a variety of common AI use cases . Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. 
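To ground the embeddings description above, here is a minimal, hedged sketch (assuming the google-genai Python SDK and the gemini-embedding-exp-03-07 model named above; the embedContent call itself is covered in the next section) that embeds two similar sentences and compares them with cosine similarity:

from google import genai

client = genai.Client()  # assumes a configured GEMINI_API_KEY

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents=["I took my dog to the vet", "I took my cat to the vet"],
)

# Each entry in result.embeddings holds a list of floats in .values;
# semantically similar sentences should yield vectors that are close together.
vec_a, vec_b = (e.values for e in result.embeddings)
dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm = (sum(a * a for a in vec_a) ** 0.5) * (sum(b * b for b in vec_b) ** 0.5)
print("cosine similarity:", dot / norm)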
Generate embeddings Use the embedContent method to generate \ No newline at end of file diff --git a/docstore/da3afeb1-3dea-46cf-971d-ad82e181d1ec b/docstore/da3afeb1-3dea-46cf-971d-ad82e181d1ec new file mode 100644 index 0000000000000000000000000000000000000000..ea67dfb950f5bf1f1ddc040da181eb1accb5974b --- /dev/null +++ b/docstore/da3afeb1-3dea-46cf-971d-ad82e181d1ec @@ -0,0 +1 @@ +breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k \ No newline at end of file diff --git a/docstore/da4cef58-9d94-4548-a835-bb0dd1ac5695 b/docstore/da4cef58-9d94-4548-a835-bb0dd1ac5695 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/da4cef58-9d94-4548-a835-bb0dd1ac5695 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? 
` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/da54e47e-89e0-4a72-af5b-f3123ef40ef0 b/docstore/da54e47e-89e0-4a72-af5b-f3123ef40ef0 new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/da54e47e-89e0-4a72-af5b-f3123ef40ef0 @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. 
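# (The original snippet is cut off at this point in the chunk. A hedged continuation
# sketch of what a completed ToolConfig for the modes described above can look like;
# the function name in the allow-list is illustrative only.)
    function_calling_config=types.FunctionCallingConfig(
        mode="ANY",  # or "AUTO" (default) / "NONE"
        allowed_function_names=["turn_on_the_lights"],
    )
)
# The tool_config is then passed alongside the tools, for example:
# config = types.GenerateContentConfig(tools=[...], tool_config=tool_config)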
You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/da6c56cc-4a0a-4bd1-8489-b56abc65b4b9 b/docstore/da6c56cc-4a0a-4bd1-8489-b56abc65b4b9 new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/da6c56cc-4a0a-4bd1-8489-b56abc65b4b9 @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/da70bf0b-920b-4b13-b798-821de02a62aa b/docstore/da70bf0b-920b-4b13-b798-821de02a62aa new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/da70bf0b-920b-4b13-b798-821de02a62aa @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
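Circling back to the Veo model parameters listed a little earlier (prompt, image, negativePrompt, aspectRatio), here is a minimal, hedged sketch in Python (assuming the google-genai SDK's generate_videos, GenerateVideosConfig, and the operation polling shown in the Veo examples above) of setting those parameters from code:

import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Panning wide shot of a calico kitten sleeping in the sunshine",
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",            # aspectRatio
        negative_prompt="low quality",  # negativePrompt
        number_of_videos=1,
    ),
)

# Video generation is long-running; poll the operation until it completes.
while not operation.done:
    time.sleep(10)
    operation = client.operations.get(operation)

for n, generated in enumerate(operation.response.generated_videos):
    client.files.download(file=generated.video)
    generated.video.save(f"video_{n}.mp4")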
\ No newline at end of file diff --git a/docstore/da86a640-e788-4377-a40f-bdf7b1d1257b b/docstore/da86a640-e788-4377-a40f-bdf7b1d1257b new file mode 100644 index 0000000000000000000000000000000000000000..6d0fb3dda30537d366b10ce141412a20984aa796 --- /dev/null +++ b/docstore/da86a640-e788-4377-a40f-bdf7b1d1257b @@ -0,0 +1 @@ +where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" \ No newline at end of file diff --git a/docstore/da8c73f7-828b-4c47-b70c-a0acd55d26d9 b/docstore/da8c73f7-828b-4c47-b70c-a0acd55d26d9 new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/da8c73f7-828b-4c47-b70c-a0acd55d26d9 @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. 
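# (The original chunk ends here. A hedged sketch of the remaining steps implied by
# the comment above: save the per-item mask and a composited overlay; the output
# filenames are illustrative only.)
    mask.save(os.path.join(output_dir, mask_filename))
    composited = Image.alpha_composite(im.convert("RGBA"), overlay)
    composited.save(os.path.join(output_dir, f"{item['label']}_{i}_overlay.png"))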
Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The \ No newline at end of file diff --git a/docstore/da9ca531-b6e1-4a4f-b7f7-0f374fda1651 b/docstore/da9ca531-b6e1-4a4f-b7f7-0f374fda1651 new file mode 100644 index 0000000000000000000000000000000000000000..72fc1af4a198f3f6ddd1bd2823f7a9c0e142bbbb --- /dev/null +++ b/docstore/da9ca531-b6e1-4a4f-b7f7-0f374fda1651 @@ -0,0 +1 @@ +should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . 
\ No newline at end of file diff --git a/docstore/daaa350c-9ae3-42ff-a09f-cc80abe0da5d b/docstore/daaa350c-9ae3-42ff-a09f-cc80abe0da5d new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/daaa350c-9ae3-42ff-a09f-cc80abe0da5d @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/dacc7048-4298-476b-92f2-cf01ae840f7c b/docstore/dacc7048-4298-476b-92f2-cf01ae840f7c new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/dacc7048-4298-476b-92f2-cf01ae840f7c @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' 
, }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/dad8fbdf-a1a0-4ab9-9c01-e18cca80f974 b/docstore/dad8fbdf-a1a0-4ab9-9c01-e18cca80f974 new file mode 100644 index 0000000000000000000000000000000000000000..3d93e98c2388d03b02165eba8a61d6ed193d8415 --- /dev/null +++ b/docstore/dad8fbdf-a1a0-4ab9-9c01-e18cca80f974 @@ -0,0 +1 @@ +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. 
Whispers of Wildflowers \ No newline at end of file diff --git a/docstore/daf0ce4c-0c1d-4ebc-a66d-c1fc01d66397 b/docstore/daf0ce4c-0c1d-4ebc-a66d-c1fc01d66397 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/daf0ce4c-0c1d-4ebc-a66d-c1fc01d66397 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/daf5889d-f60b-4700-b985-815391ba92c1 b/docstore/daf5889d-f60b-4700-b985-815391ba92c1 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/daf5889d-f60b-4700-b985-815391ba92c1 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. 
The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/db0d938c-358b-4c84-82af-fd3a28f5876f b/docstore/db0d938c-358b-4c84-82af-fd3a28f5876f new file mode 100644 index 0000000000000000000000000000000000000000..4be5f755f85341ea5e2bdefd869bac0db4833f39 --- /dev/null +++ b/docstore/db0d938c-358b-4c84-82af-fd3a28f5876f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/db3316a1-3fd3-40db-bc06-dddce3f9b9c1 b/docstore/db3316a1-3fd3-40db-bc06-dddce3f9b9c1 new file mode 100644 index 0000000000000000000000000000000000000000..3d32a6c6f44782138d2600dc9a5e7c5bf75a9a24 --- /dev/null +++ b/docstore/db3316a1-3fd3-40db-bc06-dddce3f9b9c1 @@ -0,0 +1 @@ +in 3 sentences."}, { "file_data": { "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg" } } ] }] }' 2 > /dev/null Refer to timestamps in the content You can ask questions about specific points in time within the video using timestamps of the form MM:SS . Python prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" # Adjusted timestamps for the NASA video JavaScript const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?" ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), // Adjusted timestamps for the NASA video genai . NewPartFromText ( "What are the examples given at 00:05 and " + "00:10 supposed to show us?" ), } REST PROMPT = "What are the examples given at 00:05 and 00:10 supposed to show us?" Transcribe video and provide visual descriptions The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals. Python prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." 
JavaScript const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." ; Go prompt := [] * genai . Part { genai . NewPartFromURI ( currentVideoFile . URI , currentVideoFile . MIMEType ), genai . NewPartFromText ( "Transcribe the audio from this video, giving timestamps for salient events in the video. Also " + "provide visual descriptions." ), } REST PROMPT = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions." Customize video processing You can customize video processing \ No newline at end of file diff --git a/docstore/db39048b-9821-4581-85ff-2e2f5268360e b/docstore/db39048b-9821-4581-85ff-2e2f5268360e new file mode 100644 index 0000000000000000000000000000000000000000..ee362b7ee9b7c0daf473abb2b9ab78abcbfb374b --- /dev/null +++ b/docstore/db39048b-9821-4581-85ff-2e2f5268360e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/db5f3607-a607-44a9-9838-b75da2ff5ee7 b/docstore/db5f3607-a607-44a9-9838-b75da2ff5ee7 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/db5f3607-a607-44a9-9838-b75da2ff5ee7 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . 
ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/db89eb3c-564d-4ede-9b28-55bfe7c8f924 b/docstore/db89eb3c-564d-4ede-9b28-55bfe7c8f924 new file mode 100644 index 0000000000000000000000000000000000000000..aa09fa8779a782eb0f4519da995c2b766869468f --- /dev/null +++ b/docstore/db89eb3c-564d-4ede-9b28-55bfe7c8f924 @@ -0,0 +1 @@ +genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Refer to timestamps You can refer to specific sections of an audio file using timestamps of the form MM:SS . For example, the following prompt requests a transcript that Starts at 2 minutes 30 seconds from the beginning of the file. Ends at 3 minutes 29 seconds from the beginning of the file. Python # Create a prompt containing timestamps. prompt = "Provide a transcript of the speech from 02:30 to 03:29." JavaScript // Create a prompt containing timestamps. const prompt = "Provide a transcript of the speech from 02:30 to 03:29." Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Provide a transcript of the speech " + "between the timestamps 02:30 and 03:29." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Count tokens Call the countTokens method to get a count of the number of tokens in an audio file. For example: Python response = client . models . count_tokens ( model = 'gemini-2.5-flash' , contents = [ myfile ] ) print ( response ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai \ No newline at end of file diff --git a/docstore/db984386-666f-440e-b53b-3afed56987ab b/docstore/db984386-666f-440e-b53b-3afed56987ab new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/db984386-666f-440e-b53b-3afed56987ab @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. 
You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/dba1c6df-65ed-40be-b1e9-04ef2d57ec98 b/docstore/dba1c6df-65ed-40be-b1e9-04ef2d57ec98 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/dba1c6df-65ed-40be-b1e9-04ef2d57ec98 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/dbb0fdaf-8f31-4220-ab91-70c93f511420 b/docstore/dbb0fdaf-8f31-4220-ab91-70c93f511420 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/dbb0fdaf-8f31-4220-ab91-70c93f511420 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/dbba30e8-d80e-4e4d-a712-1998188e75d0 b/docstore/dbba30e8-d80e-4e4d-a712-1998188e75d0 new file mode 100644 index 0000000000000000000000000000000000000000..fd0551506f8bbd5887cf64565b1e4a5e868fecf1 --- /dev/null +++ b/docstore/dbba30e8-d80e-4e4d-a712-1998188e75d0 @@ -0,0 +1 @@ +learn more about the latest YOUR_subject . Code examples with URL context only Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" url_context_tool = Tool ( url_context = types . UrlContext ) response = client . models . generate_content ( model = model_id , contents = "Compare recipes from YOUR_URL1 and YOUR_URL2 " , config = GenerateContentConfig ( tools = [ url_context_tool ], response_modalities = [ "TEXT" ], ) ) for each in response . candidates [ 0 ] . content . parts : print ( each . text ) # get URLs retrieved for context print ( response . candidates [ 0 ] . url_context_metadata ) Javascript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "Compare recipes from YOUR_URL1 and YOUR_URL2 " , ], config : { tools : [{ urlContext : {}}], }, }); console . log ( response . text ); // To get URLs retrieved for context console . log ( response . candidates [ 0 ]. urlContextMetadata ) } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [ { "parts": [ {"text": "Compare recipes from YOUR_URL1 and YOUR_URL2 "} ] } ], "tools": [ { "url_context": {} } ] }' > result.json cat result.json Code examples with Grounding with Google Search Python from google import genai from google.genai.types import Tool , GenerateContentConfig , GoogleSearch client = genai . Client () model_id = "gemini-2.5-flash" tools = [] tools . append ( Tool ( url_context = types . UrlContext )) tools . append ( Tool ( google_search = types . GoogleSearch )) response = client . models . 
generate_content ( model = model_id , contents = "Give me three day events schedule based on \ No newline at end of file diff --git a/docstore/dbe8ff47-c39d-4057-bd41-f9920d169b28 b/docstore/dbe8ff47-c39d-4057-bd41-f9920d169b28 new file mode 100644 index 0000000000000000000000000000000000000000..150f8758ce4500c63fdc2d62f5bb812ca3b2d976 --- /dev/null +++ b/docstore/dbe8ff47-c39d-4057-bd41-f9920d169b28 @@ -0,0 +1 @@ +client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); \ No newline at end of file diff --git a/docstore/dbf42ed0-68a8-4f0d-97e4-4ba84714c0f7 b/docstore/dbf42ed0-68a8-4f0d-97e4-4ba84714c0f7 new file mode 100644 index 0000000000000000000000000000000000000000..65337d81cbf9fba76eb8d44ddc68611350b61de7 --- /dev/null +++ b/docstore/dbf42ed0-68a8-4f0d-97e4-4ba84714c0f7 @@ -0,0 +1 @@ += lambda s : s . segment . end_index , reverse = True ) for support in sorted_supports : end_index = support . segment . end_index if support . grounding_chunk_indices : # Create citation string like [1](link1)[2](link2) citation_links = [] for i in support . grounding_chunk_indices : if i < len ( chunks ): uri = chunks [ i ] . web . uri citation_links . append ( f "[ { i + 1 } ]( { uri } )" ) citation_string = ", " . join ( citation_links ) text = text [: end_index ] + citation_string + text [ end_index :] return text # Assuming response with grounding metadata text_with_citations = add_citations ( response ) print ( text_with_citations ) JavaScript function addCitations ( response ) { let text = response . text ; const supports = response . candidates [ 0 ] ? . groundingMetadata ? . 
groundingSupports ; const chunks = response . candidates [ 0 ] ? . groundingMetadata ? . groundingChunks ; // Sort supports by end_index in descending order to avoid shifting issues when inserting. const sortedSupports = [... supports ]. sort ( ( a , b ) = > ( b . segment ? . endIndex ?? 0 ) - ( a . segment ? . endIndex ?? 0 ), ); for ( const support of sortedSupports ) { const endIndex = support . segment ? . endIndex ; if ( endIndex === undefined || ! support . groundingChunkIndices ? . length ) { continue ; } const citationLinks = support . groundingChunkIndices . map ( i = > { const uri = chunks [ i ] ? . web ? . uri ; if ( uri ) { return `[ ${ i + 1 } ]( ${ uri } )` ; } return null ; }) . filter ( Boolean ); if ( citationLinks . length > 0 ) { const citationString = citationLinks . join ( ", " ); text = text . slice ( 0 , endIndex ) + citationString + text . slice ( endIndex ); } } return text ; } const textWithCitations = addCitations ( response ); console . log ( textWithCitations ); The new response with inline citations will look like this: Spain won Euro 2024, defeating England 2-1 in the final.[1](https:/...), [2](https:/...), [4](https:/...), [5](https:/...) This victory \ No newline at end of file diff --git a/docstore/dc1343c8-7091-4305-bd62-a82a192561e1 b/docstore/dc1343c8-7091-4305-bd62-a82a192561e1 new file mode 100644 index 0000000000000000000000000000000000000000..b3339b694e68c4d7176324567b6e6d7542786980 --- /dev/null +++ b/docstore/dc1343c8-7091-4305-bd62-a82a192561e1 @@ -0,0 +1 @@ +YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request. You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video \ No newline at end of file diff --git a/docstore/dc226e95-373f-4702-9c8c-830a41ab8ddd b/docstore/dc226e95-373f-4702-9c8c-830a41ab8ddd new file mode 100644 index 0000000000000000000000000000000000000000..6cd03da77a7c04bb143fe9601905375d481c4c1f --- /dev/null +++ b/docstore/dc226e95-373f-4702-9c8c-830a41ab8ddd @@ -0,0 +1 @@ +Gemini 2.0 Flash Preview Image Generation 2,000 3,000,000 100,000 Gemini 2.0 Flash-Lite 20,000 10,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 3 Model RPM TPM RPD Gemini 2.5 Pro 2,000 8,000,000 -- Gemini 2.5 Flash 10,000 8,000,000 -- Gemini 2.5 Flash-Lite Preview 06-17 30,000 30,000,000 -- Gemini 2.5 Flash Preview TTS 1,000 1,000,000 -- Gemini 2.5 Pro Preview TTS 100 1,000,000 -- Gemini 2.0 Flash 30,000 30,000,000 -- Gemini 2.0 Flash Preview Image Generation 5,000 5,000,000 -- Gemini 2.0 Flash-Lite 30,000 30,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Specified rate limits are not guaranteed and actual capacity may vary. Live API rate limits The Live API processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. This API has a different set of rate limits than the standard Gemini API calls. Free Tier Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 3 1,000,000 -- Gemini 2.0 Flash Live 3 1,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 1 25,000 5 Gemini 2.5 Flash Experimental Native Audio Thinking Dialog 1 10,000 5 Tier 1 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 50 4,000,000 -- Gemini 2.0 Flash Live 50 4,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 3 50,000 50 Gemini 2.5 Flash Experimental Native Audio Thinking \ No newline at end of file diff --git a/docstore/dc28c0a9-9049-49fa-9ea9-ce74c3cedcdb b/docstore/dc28c0a9-9049-49fa-9ea9-ce74c3cedcdb new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/dc28c0a9-9049-49fa-9ea9-ce74c3cedcdb @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. 
Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/dc370a8c-3dcc-4818-bd03-c69bf7f1b638 b/docstore/dc370a8c-3dcc-4818-bd03-c69bf7f1b638 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/dc370a8c-3dcc-4818-bd03-c69bf7f1b638 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/dc474b12-50e6-41d3-ae8e-cb79cd16c085 b/docstore/dc474b12-50e6-41d3-ae8e-cb79cd16c085 new file mode 100644 index 0000000000000000000000000000000000000000..35cb4724ff25e15eeabaea84a051d2545b73055e --- /dev/null +++ b/docstore/dc474b12-50e6-41d3-ae8e-cb79cd16c085 @@ -0,0 +1 @@ +model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference to the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash) Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs anti patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti pattern \ No newline at end of file diff --git a/docstore/dc535b54-512e-46ab-b3eb-35b81a668c91 b/docstore/dc535b54-512e-46ab-b3eb-35b81a668c91 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/dc535b54-512e-46ab-b3eb-35b81a668c91 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . 
candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/dc566bf0-c307-47f0-a00c-b0e4e6b6562c b/docstore/dc566bf0-c307-47f0-a00c-b0e4e6b6562c new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/dc566bf0-c307-47f0-a00c-b0e4e6b6562c @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . 
sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/dc56aae0-fe4e-4c44-b46d-3e6bafcff36f b/docstore/dc56aae0-fe4e-4c44-b46d-3e6bafcff36f new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/dc56aae0-fe4e-4c44-b46d-3e6bafcff36f @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/dc5cb43c-5e76-4f30-ba42-3893747eff79 b/docstore/dc5cb43c-5e76-4f30-ba42-3893747eff79 new file mode 100644 index 0000000000000000000000000000000000000000..bebef94a2670f1eebe768f4cb4680db515bfe743 --- /dev/null +++ b/docstore/dc5cb43c-5e76-4f30-ba42-3893747eff79 @@ -0,0 +1 @@ +it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. 
Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue \ No newline at end of file diff --git a/docstore/dc7ca2fc-7797-4794-bad5-8009becbb00b b/docstore/dc7ca2fc-7797-4794-bad5-8009becbb00b new file mode 100644 index 0000000000000000000000000000000000000000..2dce4b1915975420243f156ab22de6a07e8b5cc9 --- /dev/null +++ b/docstore/dc7ca2fc-7797-4794-bad5-8009becbb00b @@ -0,0 +1 @@ +The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/dc7d043a-ef72-4978-a691-288f8cfa9e65 b/docstore/dc7d043a-ef72-4978-a691-288f8cfa9e65 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/dc7d043a-ef72-4978-a691-288f8cfa9e65 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/dc7f7be8-9d91-4966-bbec-85ef573c844a b/docstore/dc7f7be8-9d91-4966-bbec-85ef573c844a new file mode 100644 index 0000000000000000000000000000000000000000..08922eb1e5da83e7a67a2a4aeaf4437890d1333a --- /dev/null +++ b/docstore/dc7f7be8-9d91-4966-bbec-85ef573c844a @@ -0,0 +1 @@ +trademark of Oracle and/or its affiliates. Last updated 2025-05-31 UTC. \ No newline at end of file diff --git a/docstore/dca4ac52-a413-45f4-88cf-9f9512a8aa8c b/docstore/dca4ac52-a413-45f4-88cf-9f9512a8aa8c new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/dca4ac52-a413-45f4-88cf-9f9512a8aa8c @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . 
receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/dca52453-6668-42af-b3ab-b0b62c763327 b/docstore/dca52453-6668-42af-b3ab-b0b62c763327 new file mode 100644 index 0000000000000000000000000000000000000000..34a11694c4ec18846b4474603e1c63c04851d790 --- /dev/null +++ b/docstore/dca52453-6668-42af-b3ab-b0b62c763327 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'When did the last Brazil vs. Argentina soccer match happen?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Combining multiple tools You can combine multiple tools within the Live API, increasing your application's capabilities even more: Python prompt = """ Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! """ tools = [ { "google_search" : {}}, { "code_execution" : {}}, { "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}, ] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } # ... remaining model call JavaScript const prompt = `Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024? 3. Turn on the lights Thanks! 
` const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ \ No newline at end of file diff --git a/docstore/dca85733-3d68-4d8e-9c4a-572322a14015 b/docstore/dca85733-3d68-4d8e-9c4a-572322a14015 new file mode 100644 index 0000000000000000000000000000000000000000..465170b0f695eefaa56d7f3871f4f3df2424da62 --- /dev/null +++ b/docstore/dca85733-3d68-4d8e-9c4a-572322a14015 @@ -0,0 +1 @@ +brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ \ No newline at end of file diff --git a/docstore/dcb58817-4586-49a8-8033-33744218b4e7 b/docstore/dcb58817-4586-49a8-8033-33744218b4e7 new file mode 100644 index 0000000000000000000000000000000000000000..ebc7dcd270f0e6ef8ac74f3b151799b5da91d508 --- /dev/null +++ b/docstore/dcb58817-4586-49a8-8033-33744218b4e7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling/tutorial#main-content Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/dcbefb56-b2d1-4d41-9a0a-1a5779af399f b/docstore/dcbefb56-b2d1-4d41-9a0a-1a5779af399f new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/dcbefb56-b2d1-4d41-9a0a-1a5779af399f @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. 
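For illustration only, here is a minimal sketch of the manual flow just described, returning one result per requested call in the same order, using the google-genai Python SDK; the get_weather tool, its declaration, and the prompt are hypothetical placeholders, not part of the original example: Python
from google import genai
from google.genai import types

client = genai.Client()

def get_weather(city: str) -> dict:
    """Hypothetical tool: returns canned weather data for a city."""
    return {"city": city, "forecast": "sunny", "temperature_f": 90}

# Hypothetical declaration for the tool above.
weather_declaration = {
    "name": "get_weather",
    "description": "Gets the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[weather_declaration])]
)

contents = [
    types.Content(
        role="user",
        parts=[types.Part(text="What's the weather in Boston and in Miami?")],
    )
]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

# Keep the model's function-call turn, then answer every call it made,
# in the same order the calls were requested. (A production loop should
# first check that the model actually requested any calls.)
contents.append(response.candidates[0].content)
ordered_results = [
    types.Part.from_function_response(
        name=call.name,
        response={"result": get_weather(**(call.args or {}))},
    )
    for part in response.candidates[0].content.parts
    if (call := part.function_call)
]
contents.append(types.Content(role="user", parts=ordered_results))

final = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
print(final.text)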
The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/dcc04765-58cf-4b40-b15c-f3649dcc95da b/docstore/dcc04765-58cf-4b40-b15c-f3649dcc95da new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/dcc04765-58cf-4b40-b15c-f3649dcc95da @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... 
Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/dcc32a1c-6363-4f24-84f8-252d520f4e03 b/docstore/dcc32a1c-6363-4f24-84f8-252d520f4e03 new file mode 100644 index 0000000000000000000000000000000000000000..5f25eb2a53a9afab2cc27675039b1ff3f0e2b594 --- /dev/null +++ b/docstore/dcc32a1c-6363-4f24-84f8-252d520f4e03 @@ -0,0 +1 @@ +suitable for production use. Review ephemeral tokens guide for more information. Consider adding restrictions to your key: You can limit a key's permissions by adding API key restrictions . This minimizes the potential damage if the key is ever leaked. For some general best practices, you can also review this support article . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/dcc3613b-6bf4-4d62-8eae-7435e9b5369e b/docstore/dcc3613b-6bf4-4d62-8eae-7435e9b5369e new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/dcc3613b-6bf4-4d62-8eae-7435e9b5369e @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/dccaa688-9ef9-49fb-84a7-0e8efab93344 b/docstore/dccaa688-9ef9-49fb-84a7-0e8efab93344 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/dccaa688-9ef9-49fb-84a7-0e8efab93344 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . 
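As a concrete illustration of these version name patterns, the following sketch (assuming the google-genai Python SDK and an API key configured in the environment) sends the same prompt to a latest-stable, a pinned-stable, and a preview model code taken from the examples above: Python
from google import genai

client = genai.Client()

prompt = "Explain model versioning in one sentence."

# The model strings below follow the patterns described above:
# latest stable, specific stable, and preview, respectively.
for model_name in (
    "gemini-2.0-flash",              # latest stable
    "gemini-2.0-flash-001",          # specific stable version
    "gemini-2.5-pro-preview-06-05",  # preview
):
    response = client.models.generate_content(model=model_name, contents=prompt)
    print(f"{model_name}: {response.text}")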
Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/dcfc858d-0ca6-432e-a652-031087156605 b/docstore/dcfc858d-0ca6-432e-a652-031087156605 new file mode 100644 index 0000000000000000000000000000000000000000..54d5afc663364952ba885b308cb42db7820cf380 --- /dev/null +++ b/docstore/dcfc858d-0ca6-432e-a652-031087156605 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-flash-native-audio Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/dd25e6f0-0658-4cfc-91dc-ff4b2fb12aad b/docstore/dd25e6f0-0658-4cfc-91dc-ff4b2fb12aad new file mode 100644 index 0000000000000000000000000000000000000000..ddb7a9245d74e27120ebf722d781e6ffdbe95888 --- /dev/null +++ b/docstore/dd25e6f0-0658-4cfc-91dc-ff4b2fb12aad @@ -0,0 +1 @@ +"gemini-1.5-flash" ) imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about this instrument" ), genai . ImageData ( "jpeg" , imgData )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python Many of the same convenience features exist in the new SDK. For example, PIL.Image objects are automatically converted. from google import genai from PIL import Image client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Tell me a story based on this image' , Image . open ( image_path ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const organ = await ai . files . upload ({ file : "path/to/organ.jpg" , }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : [ createUserContent ([ "Tell me a story based on this image" , createPartFromUri ( organ . uri , organ . mimeType ) ]), ], }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { { Text : "Tell me a story based on this image" }, { InlineData : & genai . Blob { Data : imgData , MIMEType : "image/jpeg" }}, } contents := [] * genai . Content { { Parts : parts }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Streaming Before Python import google.generativeai as genai response = model . generate_content ( "Write a cute story about cats." , stream = True ) for chunk in response : print ( chunk . text ) \ No newline at end of file diff --git a/docstore/dd308263-ba37-45ef-8e8c-3390b9193305 b/docstore/dd308263-ba37-45ef-8e8c-3390b9193305 new file mode 100644 index 0000000000000000000000000000000000000000..696658722d20a37964165e2b0fc6d9de79b0d2b8 --- /dev/null +++ b/docstore/dd308263-ba37-45ef-8e8c-3390b9193305 @@ -0,0 +1 @@ +main () { await live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( \ No newline at end of file diff --git a/docstore/dd43d86f-6fa1-4a89-b7c5-cdcc0f1a6455 b/docstore/dd43d86f-6fa1-4a89-b7c5-cdcc0f1a6455 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/dd43d86f-6fa1-4a89-b7c5-cdcc0f1a6455 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/dd65eb11-21a2-498d-a736-046fe8af33b2 b/docstore/dd65eb11-21a2-498d-a736-046fe8af33b2 new file mode 100644 index 0000000000000000000000000000000000000000..9b6431ae9c97257d5ff4628dd401f203e2f83eb3 --- /dev/null +++ b/docstore/dd65eb11-21a2-498d-a736-046fe8af33b2 @@ -0,0 +1 @@ +Use curl to send a POST request to the predictLongRunning endpoint. # The request body includes the prompt for video generation. curl " ${ BASE_URL } /models/veo-2.0-generate-001:predictLongRunning" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X "POST" \ -d '{ "instances": [{ "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine" } ], "parameters": { "aspectRatio": "16:9", "personGeneration": "dont_allow", } }' | tee result.json | jq .name | sed 's/"//g' > op_name # Obtain operation name to download video. op_name = $( cat op_name ) # Check against status of operation. while true ; do is_done = $( curl -H "x-goog-api-key: $GEMINI_API_KEY " " ${ BASE_URL } / ${ op_name } " | tee op_check.json | jq .done ) if [ " ${ is_done } " = "true" ] ; then cat op_check.json echo "** Attach API_KEY to download video, or examine error message." break fi echo "** Video ${ op_name } has not downloaded yet! Check again after 5 seconds..." # Wait for 5 seoncds to check again. sleep 5 done This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again. Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation. Generate from images You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video. First, generate an image using Imagen : Python prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , imagen = client . models . 
generate_images ( model = "imagen-3.0-generate-002" , prompt = prompt , \ No newline at end of file diff --git a/docstore/dd8c57cb-a29e-4dad-988c-eb7f8b54495b b/docstore/dd8c57cb-a29e-4dad-988c-eb7f8b54495b new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/dd8c57cb-a29e-4dad-988c-eb7f8b54495b @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/dd8e43e6-a21f-4dc8-b7ae-79fe721bfce6 b/docstore/dd8e43e6-a21f-4dc8-b7ae-79fe721bfce6 new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7e8a7a566917639b9515f69a698a37f27a8d --- /dev/null +++ b/docstore/dd8e43e6-a21f-4dc8-b7ae-79fe721bfce6 @@ -0,0 +1 @@ +to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white spaces, newlines, and example splitters. 
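As a small, non-authoritative sketch of this guidance (reusing the earlier question-and-answer examples and assuming the google-genai Python SDK), the few-shot prompt below keeps identical labels, line breaks, and example splitters throughout, so the model has exactly one format to imitate: Python
from google import genai

client = genai.Client()

# Every example uses the same "Question:" / "Answer:" labels and a blank
# line as the splitter; the final example stops at "Answer:" so the model
# completes it in the same format.
few_shot_prompt = (
    "Question: Why is the sky blue?\n"
    "Answer: Due to Rayleigh scattering.\n"
    "\n"
    "Question: What is the cause of earthquakes?\n"
    "Answer: Sudden release of energy in the Earth's crust.\n"
    "\n"
    "Question: How is snow formed?\n"
    "Answer:"
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)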
Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking for it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add to the prompt the router's troubleshooting guide as context for it to refer to when providing a response. Prompt: Answer the question \ No newline at end of file diff --git a/docstore/dd9387cc-b35a-40cb-9ad8-bfc4ee409fae b/docstore/dd9387cc-b35a-40cb-9ad8-bfc4ee409fae new file mode 100644 index 0000000000000000000000000000000000000000..954b7c2f84cbdb4650dd5497451af99dc34fb5db --- /dev/null +++ b/docstore/dd9387cc-b35a-40cb-9ad8-bfc4ee409fae @@ -0,0 +1 @@ +GoogleGenAI ({}); let response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." , ], config : { tools : [{ codeExecution : {} }], }, }); const parts = response ? . candidates ? .[ 0 ] ? . content ? . parts || []; parts . forEach (( part ) = > { if ( part . text ) { console . log ( part . text ); } if ( part . executableCode && part . executableCode . code ) { console . log ( part . executableCode . code ); } if ( part . codeExecutionResult && part . codeExecutionResult . output ) { console . log ( part . codeExecutionResult . output ); } }); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." ), config , ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d ' {"tools": [{"code_execution": {}}], "contents": { "parts": { "text": "What is the sum of the first 50 prime numbers? Generate and run code for the calculation, and make sure you get all 50." } }, }' Note: This REST example doesn't parse the JSON response as shown in the example output. 
The output might look something like the following, which has been formatted for readability: Okay, I need to calculate \ No newline at end of file diff --git a/docstore/dd97be3a-e1e1-402b-82f1-4b9a3cbbeac5 b/docstore/dd97be3a-e1e1-402b-82f1-4b9a3cbbeac5 new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/dd97be3a-e1e1-402b-82f1-4b9a3cbbeac5 @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/ddaaea32-bc66-40f1-9e47-ed83317df174 b/docstore/ddaaea32-bc66-40f1-9e47-ed83317df174 new file mode 100644 index 0000000000000000000000000000000000000000..1dc556e1b0caa0a5554ef0b35478c8c26ac17b7d --- /dev/null +++ b/docstore/ddaaea32-bc66-40f1-9e47-ed83317df174 @@ -0,0 +1 @@ +print ( response . text ) JavaScript and TypeScript You can access both Gemini Developer API and Vertex AI services through @google/genai library. See libraries page for instructions on how to install @google/genai . Gemini Developer API import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Vertex AI Gemini API import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ vertexai : true , project : 'your_project' , location : 'your_location' , }); async function main () { const response = await ai . models . 
generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main (); Go You can access both Gemini Developer API and Vertex AI services through google.golang.org/genai library. See libraries page for instructions on how to install google.golang.org/genai . Gemini Developer API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your Google API key const apiKey = "your-api-key" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) } Vertex AI Gemini API import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) // Your GCP project const project = "your-project" // A GCP location like "us-central1" const location = "some-gcp-location" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . ClientConfig { Project : project , Location : location , Backend : genai . BackendVertexAI , }) // Call the GenerateContent method. \ No newline at end of file diff --git a/docstore/ddaf39ee-60f0-4814-9a4a-e5ece5d4b9df b/docstore/ddaf39ee-60f0-4814-9a4a-e5ece5d4b9df new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/ddaf39ee-60f0-4814-9a4a-e5ece5d4b9df @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
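The JavaScript and Go snippets above show the two backends (Gemini Developer API and Vertex AI) side by side; a minimal Python sketch of the same choice, assuming the google-genai SDK, is shown below. The project and location strings are placeholders.

from google import genai

# Gemini Developer API: reads GEMINI_API_KEY from the environment
client = genai.Client()

# Vertex AI backend instead (placeholder project/location values):
# client = genai.Client(vertexai=True, project="your_project", location="your_location")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)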
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/ddc2504a-6ea1-444d-8fc7-82c417132f34 b/docstore/ddc2504a-6ea1-444d-8fc7-82c417132f34 new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/ddc2504a-6ea1-444d-8fc7-82c417132f34 @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . \ No newline at end of file diff --git a/docstore/ddc9718f-f7b0-47e4-927b-a132052bfb89 b/docstore/ddc9718f-f7b0-47e4-927b-a132052bfb89 new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/ddc9718f-f7b0-47e4-927b-a132052bfb89 @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. 
Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/ddd11c18-36cc-44e2-b910-59193e5b4c55 b/docstore/ddd11c18-36cc-44e2-b910-59193e5b4c55 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/ddd11c18-36cc-44e2-b910-59193e5b4c55 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . 
log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/dddc9e09-b0d8-415d-aefa-7db3de4106f8 b/docstore/dddc9e09-b0d8-415d-aefa-7db3de4106f8 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/dddc9e09-b0d8-415d-aefa-7db3de4106f8 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/dde7b6a1-197f-44f8-9491-d1ebad9c6738 b/docstore/dde7b6a1-197f-44f8-9491-d1ebad9c6738 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/dde7b6a1-197f-44f8-9491-d1ebad9c6738 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/de068302-5bc6-44b9-b719-af672c540aa3 b/docstore/de068302-5bc6-44b9-b719-af672c540aa3 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/de068302-5bc6-44b9-b719-af672c540aa3 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/de141739-442f-4170-8415-30fa1b546212 b/docstore/de141739-442f-4170-8415-30fa1b546212 new file mode 100644 index 0000000000000000000000000000000000000000..2c3da71d27e65ed84712a0110484968779786f60 --- /dev/null +++ b/docstore/de141739-442f-4170-8415-30fa1b546212 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video#basics Title: Generate video using Veo | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/de1b9ce5-4d11-4db8-9a58-fe6967f58759 b/docstore/de1b9ce5-4d11-4db8-9a58-fe6967f58759 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/de1b9ce5-4d11-4db8-9a58-fe6967f58759 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
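Since the surrounding chunks describe the TTS preview models (text in, audio out, 8,000-token input limit), a short Python sketch of a speech request may help. It assumes the google-genai SDK; the SpeechConfig/VoiceConfig type names and the "Kore" voice follow the speech-generation guide, and the 24 kHz, 16-bit mono PCM framing is an assumption about the returned audio.

from google import genai
from google.genai import types
import wave

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")  # assumed voice name
            )
        ),
    ),
)

# Assumed output format: raw 16-bit PCM at 24 kHz, mono
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)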
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/de224d5f-cf8c-4ec3-b189-2a7aae08a661 b/docstore/de224d5f-cf8c-4ec3-b189-2a7aae08a661 new file mode 100644 index 0000000000000000000000000000000000000000..b02538f85c1e26824fb9d15e124ac354f46dfed1 --- /dev/null +++ b/docstore/de224d5f-cf8c-4ec3-b189-2a7aae08a661 @@ -0,0 +1 @@ +temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool \ No newline at end of file diff --git a/docstore/de4f128c-41df-43ee-96ff-bfdf938b1eab b/docstore/de4f128c-41df-43ee-96ff-bfdf938b1eab new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/de4f128c-41df-43ee-96ff-bfdf938b1eab @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
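The Python thermostat example in the chunk above is cut off before its first tool definition. For readability, here is a plausible shape for the missing get_weather_forecast helper, reconstructed from the JavaScript version of the same example; it is a hedged sketch, not the original text.

def get_weather_forecast(location: str) -> dict:
    """Gets the current weather temperature for a given location."""
    print(f"Tool Call: get_weather_forecast(location={location})")
    # TODO: Make API call
    print("Tool Response: {'temperature': 25, 'unit': 'celsius'}")
    return {"temperature": 25, "unit": "celsius"}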
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/de592198-6c88-4e02-be5c-83a60891ece3 b/docstore/de592198-6c88-4e02-be5c-83a60891ece3 new file mode 100644 index 0000000000000000000000000000000000000000..a93f111d87cca3d0226fc4cdf96775703aa03950 --- /dev/null +++ b/docstore/de592198-6c88-4e02-be5c-83a60891ece3 @@ -0,0 +1 @@ +genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. 
curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a \ No newline at end of file diff --git a/docstore/de8c4e7d-ed19-4261-968c-de379b4da3ce b/docstore/de8c4e7d-ed19-4261-968c-de379b4da3ce new file mode 100644 index 0000000000000000000000000000000000000000..ff5e8b2b2161bd7580154c5132c11171d6f05cff --- /dev/null +++ b/docstore/de8c4e7d-ed19-4261-968c-de379b4da3ce @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/dea23277-41d3-450a-bad5-98bffb48590f b/docstore/dea23277-41d3-450a-bad5-98bffb48590f new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/dea23277-41d3-450a-bad5-98bffb48590f @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. 
If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/deb4b245-c8e4-4a34-8f8c-50f8333c4b6e b/docstore/deb4b245-c8e4-4a34-8f8c-50f8333c4b6e new file mode 100644 index 0000000000000000000000000000000000000000..df0e1f9fb0a005441553bb4be8975eaf5201dec5 --- /dev/null +++ b/docstore/deb4b245-c8e4-4a34-8f8c-50f8333c4b6e @@ -0,0 +1 @@ +Document understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Document understanding Gemini models can process documents in PDF format, using native vision to understand entire document contexts. This goes beyond simple text extraction, allowing Gemini to: Analyze and interpret content, including text, images, diagrams, charts, and tables, even in long documents up to 1000 pages. Extract information into structured output formats. Summarize and answer questions based on both the visual and textual elements in a document. Transcribe document content (e.g. to HTML), preserving layouts and formatting, for use in downstream applications. Passing inline PDF data You can pass inline PDF data in the request to generateContent . For PDF payloads under 20MB, you can choose between uploading base64 encoded documents or directly uploading locally stored files. The following example shows you how to fetch a PDF from a URL and convert it to bytes for processing: Python from google import genai from google.genai import types import httpx client = genai . Client () doc_url = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" # Retrieve and encode the PDF byte doc_data = httpx . get ( doc_url ) . content prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = doc_data , mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfResp = await fetch \ No newline at end of file diff --git a/docstore/deb83f80-2216-4469-bd16-83d65519a496 b/docstore/deb83f80-2216-4469-bd16-83d65519a496 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/deb83f80-2216-4469-bd16-83d65519a496 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
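The document-understanding chunk above inlines the PDF bytes fetched with httpx. For larger files, or to reuse the same document across requests, the File API is the usual alternative; a minimal Python sketch follows, assuming the google-genai SDK, with "report.pdf" as a hypothetical local path.

from google import genai

client = genai.Client()
uploaded = client.files.upload(file="report.pdf")  # hypothetical local PDF
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[uploaded, "Summarize this document"],
)
print(response.text)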
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/decbb514-c57b-437e-aa5c-ffd62a1f9373 b/docstore/decbb514-c57b-437e-aa5c-ffd62a1f9373 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/decbb514-c57b-437e-aa5c-ffd62a1f9373 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. 
The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/dede1408-db04-4e97-92ff-d464329b2e45 b/docstore/dede1408-db04-4e97-92ff-d464329b2e45 new file mode 100644 index 0000000000000000000000000000000000000000..54ff3139001cad531cb76ca5ae25b2688a321ffa --- /dev/null +++ b/docstore/dede1408-db04-4e97-92ff-d464329b2e45 @@ -0,0 +1 @@ +angle," "worms eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot." Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects. Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals." Veo prompt guide This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results. Safety filters Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked. Prompt writing basics Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt: Subject : The object, person, animal, or scenery that you want in your video. Context : The background or context in which the subject is placed. Action : What the subject is doing (for example, walking , running , or turning their head ). Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style. Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot . Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up . Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones . More tips for writing prompts The following tips help you write prompts that generate your videos: \ No newline at end of file diff --git a/docstore/dee1cb2c-50cb-4f2a-be8b-a62ef9fc8e23 b/docstore/dee1cb2c-50cb-4f2a-be8b-a62ef9fc8e23 new file mode 100644 index 0000000000000000000000000000000000000000..eb9021b0c62d8225572ffa88a8581263b0a8ad78 --- /dev/null +++ b/docstore/dee1cb2c-50cb-4f2a-be8b-a62ef9fc8e23 @@ -0,0 +1 @@ +and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. 
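To make the prompting tips above concrete, here is a small Python sketch that combines a system instruction with a JSON-typed response, assuming the google-genai SDK; the prompt text is illustrative only.

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List three ingredients for pancakes.",
    config=types.GenerateContentConfig(
        system_instruction="You are a terse cooking assistant.",  # guides tone and role
        response_mime_type="application/json",                    # ask for structured output
    ),
)
print(response.text)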
What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/defedb64-74c6-404f-8563-8ce83e3af2ca b/docstore/defedb64-74c6-404f-8563-8ce83e3af2ca new file mode 100644 index 0000000000000000000000000000000000000000..d8377d018b3db5f7ee9855aef65f187ecbbeaa66 --- /dev/null +++ b/docstore/defedb64-74c6-404f-8563-8ce83e3af2ca @@ -0,0 +1 @@ +print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works in a few words" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ \ No newline at end of file diff --git a/docstore/df0856b5-e372-46dd-bf64-978567d5a7cf b/docstore/df0856b5-e372-46dd-bf64-978567d5a7cf new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/df0856b5-e372-46dd-bf64-978567d5a7cf @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path .
join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/df0ed19f-1bd0-4e6d-b206-fcbe8e57c728 b/docstore/df0ed19f-1bd0-4e6d-b206-fcbe8e57c728 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/df0ed19f-1bd0-4e6d-b206-fcbe8e57c728 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
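Rather than hand-computing the tiling arithmetic described above, you can ask the API how many tokens an image prompt will consume. A minimal sketch follows, assuming the google-genai SDK accepts a PIL image in contents here just as it does for generate_content, and using a hypothetical image path.

from google import genai
from PIL import Image

client = genai.Client()
img = Image.open("path/to/image.png")  # hypothetical local image
info = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents=[img, "Describe this image."],
)
print(info.total_tokens)  # includes the per-tile image cost described above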
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/df1b5ca3-ef4b-47eb-bf07-823346a1434c b/docstore/df1b5ca3-ef4b-47eb-bf07-823346a1434c new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/df1b5ca3-ef4b-47eb-bf07-823346a1434c @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. 
If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/df2a8fdf-d5f9-4ece-a163-b8244284a09c b/docstore/df2a8fdf-d5f9-4ece-a163-b8244284a09c new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/df2a8fdf-d5f9-4ece-a163-b8244284a09c @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/df2cf4ac-1609-4782-b1d0-074532d39173 b/docstore/df2cf4ac-1609-4782-b1d0-074532d39173 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/df2cf4ac-1609-4782-b1d0-074532d39173 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . 
push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/df2f7938-e639-47f6-a493-65d0724026cc b/docstore/df2f7938-e639-47f6-a493-65d0724026cc new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/df2f7938-e639-47f6-a493-65d0724026cc @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/df435072-8235-49fc-a977-1bd73becef8d b/docstore/df435072-8235-49fc-a977-1bd73becef8d new file mode 100644 index 0000000000000000000000000000000000000000..2426e4316b986fa0eda84ec610a81c084b69e3a5 --- /dev/null +++ b/docstore/df435072-8235-49fc-a977-1bd73becef8d @@ -0,0 +1 @@ += genai . embed_content ( model = 'models/text-embedding-004' , content = 'Hello world' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "text-embedding-004" , }); const result = await model . embedContent ( "Hello world!" ); console . log ( result . embedding ); After Python from google import genai client = genai . Client () response = client . models . embed_content ( model = 'text-embedding-004' , contents = 'Hello world' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const text = "Hello World!" ; const result = await ai . models . embedContent ({ model : "text-embedding-004" , contents : text , config : { outputDimensionality : 10 }, }); console . log ( result . embeddings ); Tune a Model Create and use a tuned model. The new SDK simplifies tuning with client.tunings.tune , which launches the tuning job and polls until the job is complete. Before Python import google.generativeai as genai import random # create tuning model train_data = {} for i in range ( 1 , 6 ): key = f 'input { i } ' value = f 'output { i } ' train_data [ key ] = value name = f 'generate-num- { random . randint ( 0 , 10000 ) } ' operation = genai . create_tuned_model ( source_model = 'models/gemini-1.5-flash-001-tuning' , training_data = train_data , id = name , epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , ) # wait for tuning complete tuningProgress = operation . result () # generate content with the tuned model model = genai . 
GenerativeModel ( model_name = f 'tunedModels/ { name } ' ) response = model . generate_content ( '55' ) After Python from google import genai from google.genai import types client = genai . Client () # Check which models are available for tuning. for m in client . models . list (): for action in m . supported_actions : if action == \ No newline at end of file diff --git a/docstore/df523304-77ca-4d66-a03e-a92213d92942 b/docstore/df523304-77ca-4d66-a03e-a92213d92942 new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/df523304-77ca-4d66-a03e-a92213d92942 @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/df6b9abd-47ce-40ae-bcf1-3ebc1e6699aa b/docstore/df6b9abd-47ce-40ae-bcf1-3ebc1e6699aa new file mode 100644 index 0000000000000000000000000000000000000000..6b1a11d386f4b560f93e6fc6fce6c7f46a05bdf0 --- /dev/null +++ b/docstore/df6b9abd-47ce-40ae-bcf1-3ebc1e6699aa @@ -0,0 +1 @@ +responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . 
debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Compute the largest prime palindrome under 100000.' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Grounding with Google Search You can enable Grounding with Google Search as part of the session configuration. This increases the Live API's accuracy and prevents hallucinations. See the Grounding tutorial to learn more. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = \ No newline at end of file diff --git a/docstore/df749208-3285-4694-b319-d6cea07e05ac b/docstore/df749208-3285-4694-b319-d6cea07e05ac new file mode 100644 index 0000000000000000000000000000000000000000..b3339b694e68c4d7176324567b6e6d7542786980 --- /dev/null +++ b/docstore/df749208-3285-4694-b319-d6cea07e05ac @@ -0,0 +1 @@ +YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request. You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video \ No newline at end of file diff --git a/docstore/df77902b-98dd-41af-a9df-c1f0408b4c49 b/docstore/df77902b-98dd-41af-a9df-c1f0408b4c49 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/df77902b-98dd-41af-a9df-c1f0408b4c49 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/df7b8f03-2d55-4c59-b9d9-bd96d262ca8d b/docstore/df7b8f03-2d55-4c59-b9d9-bd96d262ca8d new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/df7b8f03-2d55-4c59-b9d9-bd96d262ca8d @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
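The files.get snippet above prints metadata once; when a larger upload (for example a video) is still being processed, it can help to poll that metadata until the file reaches the ACTIVE state before prompting with it. The following is a minimal sketch of that pattern using the google-genai Python SDK; the file path, model name, and the exact state-checking expression are illustrative assumptions rather than code from the page above.

# Hedged sketch: poll a freshly uploaded file until it is no longer PROCESSING,
# then use it in a prompt. Path and model are placeholders.
import time
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp4")  # placeholder path

# files.get returns current metadata, including the processing state
# (assumption: the state enum exposes names like PROCESSING / ACTIVE).
while client.files.get(name=myfile.name).state.name == "PROCESSING":
    time.sleep(2)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, "Summarize this video in 3 sentences."],
)
print(response.text)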
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/df7fa8f8-4dbf-45e9-97dc-0420afd9c48a b/docstore/df7fa8f8-4dbf-45e9-97dc-0420afd9c48a new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/df7fa8f8-4dbf-45e9-97dc-0420afd9c48a @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . 
size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/df8e8cbd-2070-49a7-8f1e-ac200bca95a5 b/docstore/df8e8cbd-2070-49a7-8f1e-ac200bca95a5 new file mode 100644 index 0000000000000000000000000000000000000000..53d3e426b4ff6e145f59bebdb86773397956de24 --- /dev/null +++ b/docstore/df8e8cbd-2070-49a7-8f1e-ac200bca95a5 @@ -0,0 +1 @@ +field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. Schemas that contain $ref cannot contain any properties other than those starting with a $ . Here's an example of generating a JSON Schema with Pydantic and submitting it to the model: curl "https://generativelanguage.googleapis.com/v1alpha/models/\ gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d @- < setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/dfebdacd-955c-4c87-97d5-c83d555ef944 b/docstore/dfebdacd-955c-4c87-97d5-c83d555ef944 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/dfebdacd-955c-4c87-97d5-c83d555ef944 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
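The curl example for responseJsonSchema above is cut off in this chunk. As an illustrative, non-authoritative sketch, the request below builds a JSON Schema with Pydantic and posts it to the v1alpha endpoint named above; the placement of responseJsonSchema inside generationConfig, the Recipe model, and the prompt are assumptions based on the surrounding description, not a copy of the original example.

# Hedged sketch: submit a Pydantic-generated JSON Schema via responseJsonSchema.
import os
import requests
from pydantic import BaseModel

class Recipe(BaseModel):  # hypothetical schema for illustration
    recipe_name: str
    ingredients: list[str]

payload = {
    "contents": [{"parts": [{"text": "Give me a cookie recipe."}]}],
    "generationConfig": {
        "responseMimeType": "application/json",
        # Assumption: responseJsonSchema sits under generationConfig and accepts
        # any JSON Schema, within the limitations listed above.
        "responseJsonSchema": Recipe.model_json_schema(),
    },
}

resp = requests.post(
    "https://generativelanguage.googleapis.com/v1alpha/models/gemini-2.5-flash:generateContent",
    headers={
        "x-goog-api-key": os.environ["GEMINI_API_KEY"],
        "Content-Type": "application/json",
    },
    json=payload,
)
print(resp.json()["candidates"][0]["content"]["parts"][0]["text"])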
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/e00243a3-1447-4c89-bd3d-53b622ee7693 b/docstore/e00243a3-1447-4c89-bd3d-53b622ee7693 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/e00243a3-1447-4c89-bd3d-53b622ee7693 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . 
For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/e00805da-2073-41b5-83d2-7762189d301a b/docstore/e00805da-2073-41b5-83d2-7762189d301a new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/e00805da-2073-41b5-83d2-7762189d301a @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/e0088993-4b4c-486e-9113-8371105b78d1 b/docstore/e0088993-4b4c-486e-9113-8371105b78d1 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/e0088993-4b4c-486e-9113-8371105b78d1 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . 
parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/e00f5a6f-27ce-41ab-ad94-6d992b15c645 b/docstore/e00f5a6f-27ce-41ab-ad94-6d992b15c645 new file mode 100644 index 0000000000000000000000000000000000000000..e6f71e62f1d38e5969349ef563bd1d1143e3d3e1 --- /dev/null +++ b/docstore/e00f5a6f-27ce-41ab-ad94-6d992b15c645 @@ -0,0 +1 @@ +shows you how to specify ambiance. Ambiance Prompt Generated output Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph. A close-up of a girl holding adorable golden retriever puppy in the park, sunlight. Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood. Use reference images to generate videos You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new. Prompt Generated output Bunny with a chocolate candy bar. Bunny runs away. Negative prompts Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips: ❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls". ✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video. Prompt Generated output Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. 
Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. With negative \ No newline at end of file diff --git a/docstore/e0109722-7d97-4281-8dcb-2bdc4f4e8553 b/docstore/e0109722-7d97-4281-8dcb-2bdc4f4e8553 new file mode 100644 index 0000000000000000000000000000000000000000..257bd26d73ffcf83a4c86e6ef658baba1dfda511 --- /dev/null +++ b/docstore/e0109722-7d97-4281-8dcb-2bdc4f4e8553 @@ -0,0 +1 @@ +brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( \ No newline at end of file diff --git a/docstore/e0372969-71f9-488d-b884-930d7f199245 b/docstore/e0372969-71f9-488d-b884-930d7f199245 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/e0372969-71f9-488d-b884-930d7f199245 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
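To make the compositional function calling flow described above concrete, here is a small self-contained sketch using automatic function calling in the google-genai Python SDK. The function bodies are dummies, and the signature of set_thermostat_temperature plus the prompt are assumptions, since the original snippet is truncated in this chunk.

# Hedged sketch of chained (compositional) tool use via automatic function calling.
from google import genai
from google.genai import types

def get_weather_forecast(location: str) -> dict:
    """Gets the current weather temperature for a given location."""
    return {"temperature": 25, "unit": "celsius"}  # dummy response

def set_thermostat_temperature(temperature: int) -> dict:
    """Sets the thermostat to the given temperature."""
    return {"status": "ok", "temperature": temperature}  # dummy response

client = genai.Client()
config = types.GenerateContentConfig(
    # The SDK converts these Python callables into tool schemas and executes
    # the calls the model requests, feeding results back automatically.
    tools=[get_weather_forecast, set_thermostat_temperature]
)

# The model may first call get_weather_forecast, then pass its result into
# set_thermostat_temperature before producing the final text answer.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="If it's warmer than 20°C outside, set the thermostat to 20°C.",
    config=config,
)
print(response.text)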
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/e03a3ecf-72f6-4ecc-b463-515c9bf7dfe5 b/docstore/e03a3ecf-72f6-4ecc-b463-515c9bf7dfe5 new file mode 100644 index 0000000000000000000000000000000000000000..00c5cdc92c5adc1f652744cb7f71ce82eb12e3dc --- /dev/null +++ b/docstore/e03a3ecf-72f6-4ecc-b463-515c9bf7dfe5 @@ -0,0 +1 @@ +( 'https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf' ) . then (( response ) = > response . arrayBuffer ()); const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( pdfResp ). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfResp , _ := http . Get ( "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" ) var pdfBytes [] byte if pdfResp != nil && pdfResp . Body != nil { pdfBytes , _ = io . ReadAll ( pdfResp . Body ) pdfResp . Body . Close () } parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST DOC_URL = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" PROMPT = "Summarize this document" DISPLAY_NAME = "base64_pdf" # Download the PDF wget -O " ${ DISPLAY_NAME } .pdf" " ${ DOC_URL } " # Check for FreeBSD base64 and set flags accordingly if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi # Base64 encode the PDF ENCODED_PDF = $( base64 $B64FLAGS " ${ DISPLAY_NAME } .pdf" ) # Generate content using the base64 encoded PDF curl \ No newline at end of file diff --git a/docstore/e0475e51-8a05-4e11-9814-52203cdbabb8 b/docstore/e0475e51-8a05-4e11-9814-52203cdbabb8 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/e0475e51-8a05-4e11-9814-52203cdbabb8 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/e04e22d7-57f3-4947-8f81-88337a61e207 b/docstore/e04e22d7-57f3-4947-8f81-88337a61e207 new file mode 100644 index 0000000000000000000000000000000000000000..91fc78231d919db46ae330599c0572b8fce05ec3 --- /dev/null +++ b/docstore/e04e22d7-57f3-4947-8f81-88337a61e207 @@ -0,0 +1 @@ +( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Uploading PDFs using the File API You can use the File API to upload larger documents. 
Always use the File API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20MB. Note: The File API lets you store up to 50MB of PDF files. Files are stored for 48 hours. You can access them in that period with your API key, but you can't download them from the API. The File API is available at no cost in all regions where the Gemini API is available. Call media.upload to upload a file using the File API. The following code uploads a document file and then uses the file in a call to models.generateContent . Large PDFs from URLs Use the File API to simplify uploading and processing large PDF files from URLs: Python from google import genai from google.genai import types import io import httpx client = genai . Client () long_context_pdf_path = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" # Retrieve and upload the PDF using the File API doc_io = io . BytesIO ( httpx . get ( long_context_pdf_path ) . content ) sample_doc = client . files . upload ( # You can pass a path or a file-like object here file = doc_io , config = dict ( mime_type = 'application/pdf' ) ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_doc , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfBuffer = await fetch ( "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" ) . then (( response ) = > \ No newline at end of file diff --git a/docstore/e05f94a0-fd6d-4874-98ca-8292ee959cf0 b/docstore/e05f94a0-fd6d-4874-98ca-8292ee959cf0 new file mode 100644 index 0000000000000000000000000000000000000000..4d2e24ea9938e140050ecec90b7146205f15036b --- /dev/null +++ b/docstore/e05f94a0-fd6d-4874-98ca-8292ee959cf0 @@ -0,0 +1 @@ +Embeddings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Embeddings Note: Introducing our first Gemini embedding model, available now to developers as gemini-embedding-exp-03-07 in the API. The Gemini API supports several embedding models that generate embeddings for words, phrases, code, and sentences. The resulting embeddings can then be used for tasks such as semantic search, text classification, and clustering, among many others. What are embeddings? Embeddings are numerical representations of text (or other media formats) that capture relationships between inputs. Text embeddings work by converting text into arrays of floating point numbers, called vectors . These vectors are designed to capture the meaning of the text. The length of the embedding array is called the vector's dimensionality . A passage of text might be represented by a vector containing hundreds of dimensions. Embeddings capture semantic meaning and context, which results in text with similar meanings having "closer" embeddings. For example, the sentence "I took my dog to the vet" and "I took my cat to the vet" would have embeddings that are close to each other in the vector space. 
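As a rough illustration of the "closer embeddings" idea above, the sketch below embeds the two example sentences and compares them with cosine similarity. It assumes the google-genai Python SDK's embed_content response exposes per-input vectors via embeddings[i].values, and it uses numpy for the arithmetic; treat it as a sketch rather than canonical sample code.

# Hedged sketch: semantically similar sentences should yield a cosine similarity near 1.0.
import numpy as np
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents=["I took my dog to the vet", "I took my cat to the vet"],
)

# Assumption: each entry in result.embeddings carries its vector in .values.
vec_a = np.array(result.embeddings[0].values)
vec_b = np.array(result.embeddings[1].values)

cosine = float(vec_a @ vec_b / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print(f"Cosine similarity: {cosine:.3f}")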
You can use embeddings to compare different texts and understand how they relate. For example, if the embeddings of the text "cat" and "dog" are close together you can infer that these words are similar in meaning, context, or both. This enables a variety of common AI use cases . Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Generate embeddings Use the embedContent method to generate \ No newline at end of file diff --git a/docstore/e0752895-686e-443a-bf21-1c6c95d1af32 b/docstore/e0752895-686e-443a-bf21-1c6c95d1af32 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/e0752895-686e-443a-bf21-1c6c95d1af32 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/e07af3c8-c1d2-495c-8f78-fafae5c8d6b3 b/docstore/e07af3c8-c1d2-495c-8f78-fafae5c8d6b3 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e07af3c8-c1d2-495c-8f78-fafae5c8d6b3 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e07f230b-ed9c-4fdb-b2dd-4a48c91f46a7 b/docstore/e07f230b-ed9c-4fdb-b2dd-4a48c91f46a7 new file mode 100644 index 0000000000000000000000000000000000000000..cd05fff49dc646621e4ad5455e6cddce9e307548 --- /dev/null +++ b/docstore/e07f230b-ed9c-4fdb-b2dd-4a48c91f46a7 @@ -0,0 +1 @@ +models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . 
generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . \ No newline at end of file diff --git a/docstore/e0b72ad2-7df1-461f-bfb4-336e5b092d50 b/docstore/e0b72ad2-7df1-461f-bfb4-336e5b092d50 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/e0b72ad2-7df1-461f-bfb4-336e5b092d50 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/e0c782d7-f609-4a1f-9f76-87b070e9276b b/docstore/e0c782d7-f609-4a1f-9f76-87b070e9276b new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/e0c782d7-f609-4a1f-9f76-87b070e9276b @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. 
"allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/e0d65877-5b8a-43a1-845b-da19602e54b4 b/docstore/e0d65877-5b8a-43a1-845b-da19602e54b4 new file mode 100644 index 0000000000000000000000000000000000000000..7dc87b548e2d57526821a9c12df5e47c7e7e0e83 --- /dev/null +++ b/docstore/e0d65877-5b8a-43a1-845b-da19602e54b4 @@ -0,0 +1 @@ +. thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... usageMetadata , err := json . MarshalIndent ( response . UsageMetadata , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Thoughts tokens:" , string ( usageMetadata . thoughts_token_count )) fmt . Println ( "Output tokens:" , string ( usageMetadata . candidates_token_count )) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct towards the right results. Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. 
Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks \ No newline at end of file diff --git a/docstore/e0e43bdc-f525-46ac-93f6-1e5ca1ba167f b/docstore/e0e43bdc-f525-46ac-93f6-1e5ca1ba167f new file mode 100644 index 0000000000000000000000000000000000000000..e2cc628b8f08495f837a8dc2b8647931cfc4f4e1 --- /dev/null +++ b/docstore/e0e43bdc-f525-46ac-93f6-1e5ca1ba167f @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-flash Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e0e69351-dcc7-4eb2-a632-8bc9ec7e81c8 b/docstore/e0e69351-dcc7-4eb2-a632-8bc9ec7e81c8 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/e0e69351-dcc7-4eb2-a632-8bc9ec7e81c8 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/e11e712c-2a24-4b3e-8d16-912e371e0077 b/docstore/e11e712c-2a24-4b3e-8d16-912e371e0077 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/e11e712c-2a24-4b3e-8d16-912e371e0077 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/e14c7710-76c6-4b5c-8ddc-6135dd57d1a2 b/docstore/e14c7710-76c6-4b5c-8ddc-6135dd57d1a2 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/e14c7710-76c6-4b5c-8ddc-6135dd57d1a2 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." 
, "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/e18d78e1-4e71-4e79-8728-f3f65a608c44 b/docstore/e18d78e1-4e71-4e79-8728-f3f65a608c44 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/e18d78e1-4e71-4e79-8728-f3f65a608c44 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. 
Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/e1936075-71c5-4a28-8695-6292812c5208 b/docstore/e1936075-71c5-4a28-8695-6292812c5208 new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/e1936075-71c5-4a28-8695-6292812c5208 @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/e19c3183-f241-4a1c-a271-4a9b4efb96a0 b/docstore/e19c3183-f241-4a1c-a271-4a9b4efb96a0 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/e19c3183-f241-4a1c-a271-4a9b4efb96a0 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/e19f2d48-3a60-4b1e-9c14-1d2886f208d3 b/docstore/e19f2d48-3a60-4b1e-9c14-1d2886f208d3 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/e19f2d48-3a60-4b1e-9c14-1d2886f208d3 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
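As a rough illustration of that conversational image generation, the sketch below requests both text and image output from the preview model. It assumes the google-genai Python SDK's response_modalities config field; the prompt and output filename are placeholders, not part of the original page.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate an image of a paper airplane over a city skyline, then describe it.",
    config=types.GenerateContentConfig(
        response_modalities=["TEXT", "IMAGE"]  # assumption: request both modalities
    ),
)

# Text parts arrive as plain text; image parts arrive as inline bytes.
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        with open("generated_image.png", "wb") as f:  # placeholder filename
            f.write(part.inline_data.data)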
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/e1c1641f-6ac0-4cb3-91a0-29a9206fe03b b/docstore/e1c1641f-6ac0-4cb3-91a0-29a9206fe03b new file mode 100644 index 0000000000000000000000000000000000000000..44112a5590578bd0d81bd7d1053c465e40ca11dd --- /dev/null +++ b/docstore/e1c1641f-6ac0-4cb3-91a0-29a9206fe03b @@ -0,0 +1 @@ +tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/e1c19c43-4784-41d7-ac9b-e98271221e7c b/docstore/e1c19c43-4784-41d7-ac9b-e98271221e7c new file mode 100644 index 0000000000000000000000000000000000000000..8b1657bd0e8a588bad85661db832fbb0d660fdd6 --- /dev/null +++ b/docstore/e1c19c43-4784-41d7-ac9b-e98271221e7c @@ -0,0 +1 @@ +serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . 
push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts \ No newline at end of file diff --git a/docstore/e1cd5546-c405-4a2e-9c9c-1e21dec4e715 b/docstore/e1cd5546-c405-4a2e-9c9c-1e21dec4e715 new file mode 100644 index 0000000000000000000000000000000000000000..4a8a7222dfc27acfaa73b21a084913914a78851b --- /dev/null +++ b/docstore/e1cd5546-c405-4a2e-9c9c-1e21dec4e715 @@ -0,0 +1 @@ +"fmt" "google.golang.org/genai" "os" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . 
ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The \ No newline at end of file diff --git a/docstore/e1d41693-abd9-470b-9ed5-149a9caa03a0 b/docstore/e1d41693-abd9-470b-9ed5-149a9caa03a0 new file mode 100644 index 0000000000000000000000000000000000000000..f3a5c8d51af4fe74b88a61d9a283d0c7a963f683 --- /dev/null +++ b/docstore/e1d41693-abd9-470b-9ed5-149a9caa03a0 @@ -0,0 +1 @@ +workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. 
Zero is off \ No newline at end of file diff --git a/docstore/e1e007df-8066-4d6c-933a-4fad6e685b87 b/docstore/e1e007df-8066-4d6c-933a-4fad6e685b87 new file mode 100644 index 0000000000000000000000000000000000000000..cc324c3c93704507290a78661441fbf69d3fe892 --- /dev/null +++ b/docstore/e1e007df-8066-4d6c-933a-4fad6e685b87 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session#session-resumption Title: Session management with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e201f00c-8a4d-4f82-adf9-262487a2ea71 b/docstore/e201f00c-8a4d-4f82-adf9-262487a2ea71 new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/e201f00c-8a4d-4f82-adf9-262487a2ea71 @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google \ No newline at end of file diff --git a/docstore/e222aa18-f74e-49af-84f1-9d416f86de80 b/docstore/e222aa18-f74e-49af-84f1-9d416f86de80 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/e222aa18-f74e-49af-84f1-9d416f86de80 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/e255479e-633b-4e32-a37e-6ff82c683415 b/docstore/e255479e-633b-4e32-a37e-6ff82c683415 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/e255479e-633b-4e32-a37e-6ff82c683415 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/e2588bbc-f8be-4f87-bd39-39142e8085c3 b/docstore/e2588bbc-f8be-4f87-bd39-39142e8085c3 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/e2588bbc-f8be-4f87-bd39-39142e8085c3 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/e267a59c-ec73-44e0-852b-f97ad834f9af b/docstore/e267a59c-ec73-44e0-852b-f97ad834f9af new file mode 100644 index 0000000000000000000000000000000000000000..e01fb45cbed3453bf1ca90f079ac96b78e0dc7ff --- /dev/null +++ b/docstore/e267a59c-ec73-44e0-852b-f97ad834f9af @@ -0,0 +1 @@ +Asynchronous function calling is only supported in half-cascade audio generation. Function calling executes sequentially by default, meaning execution pauses until the results of each function call are available. This ensures sequential processing, which means you won't be able to continue interacting with the model while the functions are being run. If you don't want to block the conversation, you can tell the model to run the functions asynchronously. To do so, you first need to add a behavior to the function definitions: Python # Non-blocking function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" , "behavior" : "NON_BLOCKING" } # turn_on_the_lights will run asynchronously turn_off_the_lights = { "name" : "turn_off_the_lights" } # turn_off_the_lights will still pause all interactions with the model JavaScript import { GoogleGenAI , Modality , Behavior } from '@google/genai' ; // Non-blocking function definitions const turn_on_the_lights = { name : "turn_on_the_lights" , behavior : Behavior . NON_BLOCKING } // Blocking function definitions const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] NON-BLOCKING ensures the function runs asynchronously while you can continue interacting with the model. Then you need to tell the model how to behave when it receives the FunctionResponse using the scheduling parameter. It can either: Interrupt what it's doing and tell you about the response it got right away ( scheduling="INTERRUPT" ), Wait until it's finished with what it's currently doing ( scheduling="WHEN_IDLE" ), Or do nothing and use that knowledge later on in the discussion ( scheduling="SILENT" ) Python # for a non-blocking function definition, apply scheduling in the function response: function_response = types . FunctionResponse ( id = fc . id , name = fc . 
name , response = { "result" : "ok" , "scheduling" : "INTERRUPT" # Can also be WHEN_IDLE or \ No newline at end of file diff --git a/docstore/e268f0b8-852e-4a4e-a8fb-14d62e2ad962 b/docstore/e268f0b8-852e-4a4e-a8fb-14d62e2ad962 new file mode 100644 index 0000000000000000000000000000000000000000..f361e30d58eacc3ddab2c6cd66cf7090d9f68a07 --- /dev/null +++ b/docstore/e268f0b8-852e-4a4e-a8fb-14d62e2ad962 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session#main-content Title: Session management with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e290166b-e8d5-4974-8d63-23c3adb21ca5 b/docstore/e290166b-e8d5-4974-8d63-23c3adb21ca5 new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/e290166b-e8d5-4974-8d63-23c3adb21ca5 @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . 
\ No newline at end of file diff --git a/docstore/e2a333ff-9b2e-4520-bb61-a5847d1db3f7 b/docstore/e2a333ff-9b2e-4520-bb61-a5847d1db3f7 new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/e2a333ff-9b2e-4520-bb61-a5847d1db3f7 @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/e2ac27be-5fa8-4f58-bd0c-6f93dd7cd0d9 b/docstore/e2ac27be-5fa8-4f58-bd0c-6f93dd7cd0d9 new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/e2ac27be-5fa8-4f58-bd0c-6f93dd7cd0d9 @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/e2d8ecbc-4b3b-4728-817b-de5424a37756 b/docstore/e2d8ecbc-4b3b-4728-817b-de5424a37756 new file mode 100644 index 0000000000000000000000000000000000000000..fc6b27785ffc2bef06b5f2067401216e3ff34ff1 --- /dev/null +++ b/docstore/e2d8ecbc-4b3b-4728-817b-de5424a37756 @@ -0,0 +1 @@ +"cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more conversational in its response: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. 
Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the \ No newline at end of file diff --git a/docstore/e2dfddae-9337-4249-9019-ba48fdc4431c b/docstore/e2dfddae-9337-4249-9019-ba48fdc4431c new file mode 100644 index 0000000000000000000000000000000000000000..bf98246a4d5f20dab4e649ac0598b2bfac1851f5 --- /dev/null +++ b/docstore/e2dfddae-9337-4249-9019-ba48fdc4431c @@ -0,0 +1 @@ +" ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl \ No newline at end of file diff --git a/docstore/e2eb5f0e-7a8f-427c-a287-5d50a2073539 b/docstore/e2eb5f0e-7a8f-427c-a287-5d50a2073539 new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/e2eb5f0e-7a8f-427c-a287-5d50a2073539 @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . 
log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/e2f3d0e6-9bed-48b8-a41a-9caae4446ed2 b/docstore/e2f3d0e6-9bed-48b8-a41a-9caae4446ed2 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e2f3d0e6-9bed-48b8-a41a-9caae4446ed2 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e2f50bff-dff6-4ce4-b131-028df479c167 b/docstore/e2f50bff-dff6-4ce4-b131-028df479c167 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/e2f50bff-dff6-4ce4-b131-028df479c167 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/e2f7ad1a-352e-4e27-b82f-b9c74ecf41fb b/docstore/e2f7ad1a-352e-4e27-b82f-b9c74ecf41fb new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/e2f7ad1a-352e-4e27-b82f-b9c74ecf41fb @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/e316ae72-2bb4-4de8-8616-4963a1657358 b/docstore/e316ae72-2bb4-4de8-8616-4963a1657358 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/e316ae72-2bb4-4de8-8616-4963a1657358 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/e3261029-03af-413e-a7b2-07f707983905 b/docstore/e3261029-03af-413e-a7b2-07f707983905 new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/e3261029-03af-413e-a7b2-07f707983905 @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). 
Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/e326c0d4-938a-4ca3-9c7c-07cbb50bbe34 b/docstore/e326c0d4-938a-4ca3-9c7c-07cbb50bbe34 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/e326c0d4-938a-4ca3-9c7c-07cbb50bbe34 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/e3379d63-11fb-4cd2-9bb4-1668cab2696d b/docstore/e3379d63-11fb-4cd2-9bb4-1668cab2696d new file mode 100644 index 0000000000000000000000000000000000000000..e8472db3f1b37b3dc2d13dd06406d0e42fc75dfb --- /dev/null +++ b/docstore/e3379d63-11fb-4cd2-9bb4-1668cab2696d @@ -0,0 +1 @@ +costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. 
Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/e33d16bf-bdca-4579-b264-6d1ba0233a7b b/docstore/e33d16bf-bdca-4579-b264-6d1ba0233a7b new file mode 100644 index 0000000000000000000000000000000000000000..4b3d79fcf31020903c40df052c1807fd4a690d51 --- /dev/null +++ b/docstore/e33d16bf-bdca-4579-b264-6d1ba0233a7b @@ -0,0 +1 @@ +like photography descriptors, shapes and materials, historical art movements, and image quality modifiers. Photography Prompt includes: "A photo of..." To use this style, start with using keywords that clearly tell Imagen that you're looking for a photograph. Start your prompts with "A photo of. . ." . For example: Prompt: A photo of coffee beans in a kitchen on a wooden surface Prompt: A photo of a chocolate bar on a kitchen counter Prompt: A photo of a modern building with water in the background Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Photography modifiers In the following examples, you can see several photography-specific modifiers and parameters. You can combine multiple modifiers for more precise control. Camera Proximity - Close up, taken from far away Prompt: A close-up photo of coffee beans Prompt: A zoomed out photo of a small bag of coffee beans in a messy kitchen Camera Position - aerial, from below Prompt: aerial photo of urban city with skyscrapers Prompt: A photo of a forest canopy with blue skies from below Lighting - natural, dramatic, warm, cold Prompt: studio photo of a modern arm chair, natural lighting Prompt: studio photo of a modern arm chair, dramatic lighting Camera Settings - motion blur, soft focus, bokeh, portrait Prompt: photo of a city with skyscrapers from the inside of a car with motion blur Prompt: soft focus photograph of a bridge in an urban city at night Lens types - 35mm, 50mm, fisheye, wide angle, macro Prompt: photo of a leaf, macro lens Prompt: street photography, new york city, fisheye lens Film types - black and white, polaroid Prompt: a polaroid portrait of a dog wearing sunglasses Prompt: black and white photo of a dog wearing sunglasses Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Illustration and art Prompt includes: "A painting of..." , "A sketch of..." Art styles vary from monochrome styles like pencil \ No newline at end of file diff --git a/docstore/e34c3f65-f8f8-4ffc-8c9a-252cf7bc8d42 b/docstore/e34c3f65-f8f8-4ffc-8c9a-252cf7bc8d42 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/e34c3f65-f8f8-4ffc-8c9a-252cf7bc8d42 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . 
join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/e34ea66b-5ec9-455b-8b29-d7882a7933d2 b/docstore/e34ea66b-5ec9-455b-8b29-d7882a7933d2 new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/e34ea66b-5ec9-455b-8b29-d7882a7933d2 @@ -0,0 +1 @@ +a picture of me. Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me.
Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/e370a874-30b8-4bf3-958d-8ed262e25555 b/docstore/e370a874-30b8-4bf3-958d-8ed262e25555 new file mode 100644 index 0000000000000000000000000000000000000000..33a8b238b28b3b4e6fb2252f6f1e5e7807510cc2 --- /dev/null +++ b/docstore/e370a874-30b8-4bf3-958d-8ed262e25555 @@ -0,0 +1 @@ +used to create the audio response: Native audio : This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue , proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking" . Native audio is supported by the following native audio models : gemini-2.5-flash-preview-native-audio-dialog gemini-2.5-flash-exp-native-audio-thinking-dialog Half-cascade audio : This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use . Half-cascaded audio is supported by the following models: gemini-live-2.5-flash-preview gemini-2.0-flash-live-001 Choose an implementation approach When integrating with Live API, you'll need to choose one of the following implementation approaches: Server-to-server : Your backend connects to the Live API using WebSockets . Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API. Client-to-server : Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend. Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. It's also easier to set up since you don't need to implement a proxy that sends data from your client to your server and then your server to the API. However, for production environments, in order to mitigate security risks, we recommend using ephemeral tokens instead of standard API keys. Get started This example reads a WAV file , sends it in the correct format, and saves the received data as WAV file. You can send audio by converting it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as response modality. The output uses \ No newline at end of file diff --git a/docstore/e37dac7c-f1fa-418c-bf8f-c6527372c7b3 b/docstore/e37dac7c-f1fa-418c-bf8f-c6527372c7b3 new file mode 100644 index 0000000000000000000000000000000000000000..6479a4b50897c899a1b9742e0d69348c2776f1d5 --- /dev/null +++ b/docstore/e37dac7c-f1fa-418c-bf8f-c6527372c7b3 @@ -0,0 +1 @@ +config = types . GenerateImagesConfig ( aspect_ratio = "16:9" , number_of_images = 1 ) ) imagen . generated_images [ 0 ] . 
image JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : "imagen-3.0-generate-002" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { numberOfImages : 1 , }, }); // you'll pass response.generatedImages[0].image.imageBytes to Veo Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { AspectRatio : "16:9" , NumberOfImages : 1 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-3.0-generate-002" , "Panning wide shot of a calico kitten sleeping in the sunshine" , config , ) // you'll pass response.GeneratedImages[0].Image to Veo } Then, generate a video using the resulting image as the first frame: Python operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = prompt , image = imagen . generated_images [ 0 ] . image , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" number_of_videos = 2 ), ) # Wait for videos to generate while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , video in enumerate ( operation . response . generated_videos ): fname = f 'with_image_input { n } .mp4' print ( fname ) client . files . download ( file = video . video ) video . video . save ( fname ) JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { // get image bytes from Imagen, as shown above let \ No newline at end of file diff --git a/docstore/e39147ad-2732-4f58-8315-637ed2257be4 b/docstore/e39147ad-2732-4f58-8315-637ed2257be4 new file mode 100644 index 0000000000000000000000000000000000000000..dd3816bb39a638c4aed0e9b1ebae84fb4c4bd269 --- /dev/null +++ b/docstore/e39147ad-2732-4f58-8315-637ed2257be4 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding#upload-video Title: Video understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e396d37e-addc-41bf-a4a5-86c41cfb2e48 b/docstore/e396d37e-addc-41bf-a4a5-86c41cfb2e48 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e396d37e-addc-41bf-a4a5-86c41cfb2e48 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e3d3feb7-3d46-4e82-978a-c6e7d9f2575d b/docstore/e3d3feb7-3d46-4e82-978a-c6e7d9f2575d new file mode 100644 index 0000000000000000000000000000000000000000..837590d728cdc0a7d72c1f2039c6b55743e812d1 --- /dev/null +++ b/docstore/e3d3feb7-3d46-4e82-978a-c6e7d9f2575d @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#imagen-4 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e4039ad7-1d35-481c-b719-fe8c744ab28a b/docstore/e4039ad7-1d35-481c-b719-fe8c744ab28a new file mode 100644 index 0000000000000000000000000000000000000000..41e1eeb4628899f8187122cba31b514ade1c909f --- /dev/null +++ b/docstore/e4039ad7-1d35-481c-b719-fe8c744ab28a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/audio Title: Audio understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e40b52b6-008c-4d30-9497-6f13261bfff8 b/docstore/e40b52b6-008c-4d30-9497-6f13261bfff8 new file mode 100644 index 0000000000000000000000000000000000000000..0cc531e753b38c557818c1cdd192ff3e6aaad575 --- /dev/null +++ b/docstore/e40b52b6-008c-4d30-9497-6f13261bfff8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#interruptions Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e413472d-7e16-418a-b6cf-7734bf665a70 b/docstore/e413472d-7e16-418a-b6cf-7734bf665a70 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/e413472d-7e16-418a-b6cf-7734bf665a70 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save 
Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/e43b90f5-76ba-4b75-ad61-6bf68e9b75e3 b/docstore/e43b90f5-76ba-4b75-ad61-6bf68e9b75e3 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/e43b90f5-76ba-4b75-ad61-6bf68e9b75e3 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
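As a rough illustration of the conversational image generation just described, a Python sketch with the google-genai SDK (the prompt and output filename are made up; the model name and response modalities follow the examples elsewhere on these pages) might be:

from google import genai
from google.genai import types

client = genai.Client()  # reads GEMINI_API_KEY from the environment

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate an image of a paper airplane gliding over a city at sunset.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# The response interleaves text parts and inline image data
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        with open("generated_image.png", "wb") as f:
            f.write(part.inline_data.data)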
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/e43efa36-690d-49aa-a5e6-697a50a094ae b/docstore/e43efa36-690d-49aa-a5e6-697a50a094ae new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/e43efa36-690d-49aa-a5e6-697a50a094ae @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . 
\ No newline at end of file diff --git a/docstore/e443aaf1-7750-44af-bc0e-e6b9257c8952 b/docstore/e443aaf1-7750-44af-bc0e-e6b9257c8952 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/e443aaf1-7750-44af-bc0e-e6b9257c8952 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/e460c55a-e27d-45e4-bdcc-b1a7b5283ceb b/docstore/e460c55a-e27d-45e4-bdcc-b1a7b5283ceb new file mode 100644 index 0000000000000000000000000000000000000000..9670cc28c51e28ab1129c38367a73e00f33ff60c --- /dev/null +++ b/docstore/e460c55a-e27d-45e4-bdcc-b1a7b5283ceb @@ -0,0 +1 @@ +meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! 
If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved \ No newline at end of file diff --git a/docstore/e4691180-40ca-42e8-b226-3f2cd4761b95 b/docstore/e4691180-40ca-42e8-b226-3f2cd4761b95 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/e4691180-40ca-42e8-b226-3f2cd4761b95 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/e46b3e17-4278-4f12-bfc9-f0cfa3d0bc5f b/docstore/e46b3e17-4278-4f12-bfc9-f0cfa3d0bc5f new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/e46b3e17-4278-4f12-bfc9-f0cfa3d0bc5f @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/e46e314c-49f6-4bb9-912f-d4281bb790a2 b/docstore/e46e314c-49f6-4bb9-912f-d4281bb790a2 new file mode 100644 index 0000000000000000000000000000000000000000..640eb117a657d0572b9b52109e4dbbb83222f720 --- /dev/null +++ b/docstore/e46e314c-49f6-4bb9-912f-d4281bb790a2 @@ -0,0 +1,3 @@ +URL: https://ai.google.dev/gemini-api/docs/tokens Title: Understand and count tokens | Gemini API | Google AI for Developers ================================================== + +Understand and count tokens | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Understand and count tokens Python JavaScript Go Gemini and other generative AI models process input and output at a granularity called a token . About tokens Tokens can be single characters like z or whole words like cat . Long words are broken up into several tokens. 
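A quick way to check these token counts programmatically is count_tokens; this is a minimal sketch assuming the google-genai Python SDK, with an arbitrary prompt string:

from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment

# Count the tokens a prompt would consume before sending it
result = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents="The quick brown fox jumps over the lazy dog.",
)
print(result.total_tokens)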
The set of all tokens used by the model is called the vocabulary, and the process of splitting text into tokens is called tokenization . For Gemini models, a token is equivalent to about 4 characters. 100 tokens is equal to about 60-80 English words. When billing is enabled, the cost of a call to the Gemini API is determined in part by the number of input and output tokens, so knowing how to count tokens can be helpful. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/e4724c8f-de6b-44dd-a81a-41e23dd5c034 b/docstore/e4724c8f-de6b-44dd-a81a-41e23dd5c034 new file mode 100644 index 0000000000000000000000000000000000000000..03e11f1b3b2cd84e15c6098d543ad30ece4e0a72 --- /dev/null +++ b/docstore/e4724c8f-de6b-44dd-a81a-41e23dd5c034 @@ -0,0 +1 @@ +Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that \ No newline at end of file diff --git a/docstore/e472a712-5c6b-402d-b659-9bf58edbc7b2 b/docstore/e472a712-5c6b-402d-b659-9bf58edbc7b2 new file mode 100644 index 0000000000000000000000000000000000000000..45c046a450410d0d7cea0863f584c81b40ede6bc --- /dev/null +++ b/docstore/e472a712-5c6b-402d-b659-9bf58edbc7b2 @@ -0,0 +1 @@ +Before Python import google.generativeai as genai model = genai . 
GenerativeModel ( 'gemini-1.5-flash' , system_instruction = 'you are a story teller for kids under 5 years old' , generation_config = genai . GenerationConfig ( max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], ) ) response = model . generate_content ( 'tell me a story in 100 words' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , generationConfig : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); const result = await model . generateContent ( "Tell me a story about a magic backpack." , ); console . log ( result . response . text ()) Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) model . SetTemperature ( 0.5 ) model . SetTopP ( 0.5 ) model . SetTopK ( 2.0 ) model . SetMaxOutputTokens ( 100 ) model . ResponseMIMEType = "application/json" resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about New York" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python For all methods in the new SDK, the required arguments are provided as keyword arguments. All optional inputs are provided in the config argument. Config arguments can be specified as either Python dictionaries or Config classes in the google.genai.types namespace. For utility and uniformity, all definitions within the types module are pydantic classes. from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = \ No newline at end of file diff --git a/docstore/e49c7648-ed05-44e7-92e5-4dc9173916a9 b/docstore/e49c7648-ed05-44e7-92e5-4dc9173916a9 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/e49c7648-ed05-44e7-92e5-4dc9173916a9 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. 
Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/e4c9f3c9-a575-40e4-9dd8-99c0be5706df b/docstore/e4c9f3c9-a575-40e4-9dd8-99c0be5706df new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/e4c9f3c9-a575-40e4-9dd8-99c0be5706df @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. 
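The text-embedding-004 and embedding-001 details above pair naturally with a small usage sketch; assuming the google-genai Python SDK (the input string is arbitrary), generating an embedding might look like this:

from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)

# One embedding per input; each has 768 values for this model
print(len(result.embeddings[0].values))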
Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/e4cb18be-8d6b-43b7-be13-6ae571114f60 b/docstore/e4cb18be-8d6b-43b7-be13-6ae571114f60 new file mode 100644 index 0000000000000000000000000000000000000000..f8a8dbefb397fad1ce4cef8dc284e6d47583724c --- /dev/null +++ b/docstore/e4cb18be-8d6b-43b7-be13-6ae571114f60 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/code-execution#main-content Title: Code execution | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e4f23f51-746a-49f3-a5c1-a4c883d6656a b/docstore/e4f23f51-746a-49f3-a5c1-a4c883d6656a new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/e4f23f51-746a-49f3-a5c1-a4c883d6656a @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. 
Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/e4fb811c-cd18-4d86-8c25-fb4417163471 b/docstore/e4fb811c-cd18-4d86-8c25-fb4417163471 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/e4fb811c-cd18-4d86-8c25-fb4417163471 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/e5083316-a335-4e08-a88b-309577774818 b/docstore/e5083316-a335-4e08-a88b-309577774818 new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/e5083316-a335-4e08-a88b-309577774818 @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. 
Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. Start by \ No newline at end of file diff --git a/docstore/e5136a36-1a2d-4ac5-9aa0-ace246a679f8 b/docstore/e5136a36-1a2d-4ac5-9aa0-ace246a679f8 new file mode 100644 index 0000000000000000000000000000000000000000..6753b1d43ae21393f7ea777aad5afc4becddf540 --- /dev/null +++ b/docstore/e5136a36-1a2d-4ac5-9aa0-ace246a679f8 @@ -0,0 +1 @@ +response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which typically is a text or function call. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... 
Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/e51a7395-e52a-444d-a8bf-0313cb449d8f b/docstore/e51a7395-e52a-444d-a8bf-0313cb449d8f new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/e51a7395-e52a-444d-a8bf-0313cb449d8f @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/e5226d8c-fcde-4512-a251-280556421895 b/docstore/e5226d8c-fcde-4512-a251-280556421895 new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/e5226d8c-fcde-4512-a251-280556421895 @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . 
UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/e54508bc-0c2a-4041-9793-082a69e1f9be b/docstore/e54508bc-0c2a-4041-9793-082a69e1f9be new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/e54508bc-0c2a-4041-9793-082a69e1f9be @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/e55f9c93-e144-404e-85f7-a1433e4722dc b/docstore/e55f9c93-e144-404e-85f7-a1433e4722dc new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/e55f9c93-e144-404e-85f7-a1433e4722dc @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/e5613f3d-e4f6-4c1f-a352-6b8bd3143574 b/docstore/e5613f3d-e4f6-4c1f-a352-6b8bd3143574 new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/e5613f3d-e4f6-4c1f-a352-6b8bd3143574 @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." 
, "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/e578aebf-23e6-4610-abb9-86a39b863bae b/docstore/e578aebf-23e6-4610-abb9-86a39b863bae new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/e578aebf-23e6-4610-abb9-86a39b863bae @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! 
done ) { const message = await waitMessage (); turns . push ( message ); if ( message . \ No newline at end of file diff --git a/docstore/e58bbf11-cf90-4b31-ae63-28c6b83dbcc8 b/docstore/e58bbf11-cf90-4b31-ae63-28c6b83dbcc8 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/e58bbf11-cf90-4b31-ae63-28c6b83dbcc8 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/e58e0cc7-5c3e-4bfa-b055-282c99df4f4d b/docstore/e58e0cc7-5c3e-4bfa-b055-282c99df4f4d new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/e58e0cc7-5c3e-4bfa-b055-282c99df4f4d @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/e5c2806b-69f9-4222-89c5-842c36df3206 b/docstore/e5c2806b-69f9-4222-89c5-842c36df3206 new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/e5c2806b-69f9-4222-89c5-842c36df3206 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. 
The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/e5cf3fc1-a16c-4895-a801-d7407ca136d5 b/docstore/e5cf3fc1-a16c-4895-a801-d7407ca136d5 new file mode 100644 index 0000000000000000000000000000000000000000..f65bfb5d195a3160683160d98bf38afd321eba5f --- /dev/null +++ b/docstore/e5cf3fc1-a16c-4895-a801-d7407ca136d5 @@ -0,0 +1 @@ +Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/e5dbb4e7-d8e3-403c-9c42-e07784fa98a7 b/docstore/e5dbb4e7-d8e3-403c-9c42-e07784fa98a7 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/e5dbb4e7-d8e3-403c-9c42-e07784fa98a7 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/e5e26e21-3bc2-46b6-9d12-2cbbbc91e19e b/docstore/e5e26e21-3bc2-46b6-9d12-2cbbbc91e19e new file mode 100644 index 0000000000000000000000000000000000000000..d461fa063b68f8660dce0a1cea933e5b74e5f278 --- /dev/null +++ b/docstore/e5e26e21-3bc2-46b6-9d12-2cbbbc91e19e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries#previous-sdks Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e5e2dcef-f262-4bae-8d9b-ee32198595b5 b/docstore/e5e2dcef-f262-4bae-8d9b-ee32198595b5 new file mode 100644 index 0000000000000000000000000000000000000000..40517314fd91c121847408df8a1f7fc600adf0b3 --- /dev/null +++ b/docstore/e5e2dcef-f262-4bae-8d9b-ee32198595b5 @@ -0,0 +1 @@ +string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). 
If you're providing examples to the model with a schema configured, and the property \ No newline at end of file diff --git a/docstore/e5e7bcb2-694d-411c-950e-0f3adf644547 b/docstore/e5e7bcb2-694d-411c-950e-0f3adf644547 new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/e5e7bcb2-694d-411c-950e-0f3adf644547 @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/e607ee5d-077d-470d-a59a-11227d366eeb b/docstore/e607ee5d-077d-470d-a59a-11227d366eeb new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/e607ee5d-077d-470d-a59a-11227d366eeb @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. 
The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/e62010f8-6f4e-4403-ba9b-0055adcee55e b/docstore/e62010f8-6f4e-4403-ba9b-0055adcee55e new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/e62010f8-6f4e-4403-ba9b-0055adcee55e @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
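A small sketch of the troubleshooting pattern above, assuming the google-genai Python SDK and Pillow: the single image is placed before the text, and the prompt asks for a description before the actual question, which makes it easier to see whether the model read the image correctly. The file path is illustrative.

```python
# Minimal sketch: image-first multimodal prompt that asks the model to describe
# the image before answering, to surface recognition errors.
# Assumes `pip install google-genai pillow` and GEMINI_API_KEY in the environment.
from google import genai
from PIL import Image

client = genai.Client()
image = Image.open("snack_ingredients.jpg")   # illustrative local file

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        image,   # single image placed before the text, as recommended
        "First, describe what's in this image in detail. "
        "Then suggest a snack I can make in 1 minute that would go well with it, "
        "and explain why.",
    ],
)
print(response.text)
```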
\ No newline at end of file diff --git a/docstore/e654c941-a210-4588-aea7-180a5e897b79 b/docstore/e654c941-a210-4588-aea7-180a5e897b79 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/e654c941-a210-4588-aea7-180a5e897b79 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/e68791e6-1e5b-4c7e-b88a-f0f74f82a181 b/docstore/e68791e6-1e5b-4c7e-b88a-f0f74f82a181 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e68791e6-1e5b-4c7e-b88a-f0f74f82a181 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. 
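The completion strategy discussed above (seeding the prompt with the start of the desired outline) translates directly into code. The following is a minimal sketch, assuming the google-genai Python SDK and a GEMINI_API_KEY in the environment; the prompt text mirrors the hummingbird example.

```python
# Minimal sketch of the completion strategy: end the prompt with the beginning of
# the format you want, and let the model continue the pattern.
# Assumes `pip install google-genai` and GEMINI_API_KEY in the environment.
from google import genai

client = genai.Client()

prompt = (
    "Create an outline for an essay about hummingbirds.\n"
    "I. Introduction\n"
    "   *"
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
)
print(response.text)
```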
Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e696a4ef-ffa4-43c8-97ce-497727ccdc81 b/docstore/e696a4ef-ffa4-43c8-97ce-497727ccdc81 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/e696a4ef-ffa4-43c8-97ce-497727ccdc81 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. 
The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/e69b46e2-b865-45f3-93e8-5f52161c0fd3 b/docstore/e69b46e2-b865-45f3-93e8-5f52161c0fd3 new file mode 100644 index 0000000000000000000000000000000000000000..53e5ed0c4b3c9d5f8d129df24753928921198efa --- /dev/null +++ b/docstore/e69b46e2-b865-45f3-93e8-5f52161c0fd3 @@ -0,0 +1 @@ +text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -X POST \ -d '{ "contents": [ { "parts": [ {"text": "Who won the euro 2024?"} ] } ], "tools": [ { "google_search": {} } ] }' You can learn more by trying the Search tool notebook . How grounding with Google Search works When you enable the google_search tool, the model handles the entire workflow of searching, processing, and citing information automatically. User Prompt: Your application sends a user's prompt to the Gemini API with the google_search tool enabled. Prompt Analysis: The model analyzes the prompt and determines if a Google Search can improve the answer. Google Search: If needed, the model automatically generates one or multiple search queries and executes them. Search Results Processing: The model processes the search results, synthesizes the information, and formulates a response. Grounded Response: The API returns a final, user-friendly response that is grounded in the search results. This response includes the model's text answer and groundingMetadata with the search queries, web results, and citations. Understanding the Grounding Response When a response is successfully grounded, the response includes a groundingMetadata field. This structured data is essential for verifying claims and building a rich citation experience in your application. { "candidates" : [ { "content" : { "parts" : [ { "text" : "Spain won Euro 2024, defeating England 2-1 in the final. This victory marks Spain's record fourth European Championship title." } ], "role" : "model" }, "groundingMetadata" : { "webSearchQueries" : [ "UEFA Euro 2024 winner" , "who won euro 2024" ], "searchEntryPoint" : { "renderedContent" : "" }, "groundingChunks" : [ { "web" : { "uri" : "https://vertexaisearch.cloud.google.com....." 
, "title" : "aljazeera.com" }}, { "web" : \ No newline at end of file diff --git a/docstore/e69c19b9-71f5-48b7-9fd6-8102cd878a91 b/docstore/e69c19b9-71f5-48b7-9fd6-8102cd878a91 new file mode 100644 index 0000000000000000000000000000000000000000..54e0e87520cd0ceb7f7706e27988156ccba6e321 --- /dev/null +++ b/docstore/e69c19b9-71f5-48b7-9fd6-8102cd878a91 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-embedding Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e6a607d2-98ad-41f0-8f52-57011bca5965 b/docstore/e6a607d2-98ad-41f0-8f52-57011bca5965 new file mode 100644 index 0000000000000000000000000000000000000000..58c080b28fe0c0a0f77a553f5c6816f2c420fdd2 --- /dev/null +++ b/docstore/e6a607d2-98ad-41f0-8f52-57011bca5965 @@ -0,0 +1 @@ +the sum of the first 50 prime numbers? " "Generate and run code for the calculation, and make sure you get all 50." ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "I have a math question for you:" }], }, { role : "model" , parts : [{ text : "Great! I'm ready for your math question. Please ask away." }], }, ], config : { tools : [{ codeExecution : {}}], } }); const response = await chat . sendMessage ({ message : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get all 50." }); console . log ( "Chat response:" , response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { Tools : [] * genai . Tool { { CodeExecution : & genai . ToolCodeExecution {}}, }, } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , config , nil , ) result , _ := chat . SendMessage ( ctx , genai . Part { Text : "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and " + "make sure you get all 50." , }, ) fmt . Println ( result . Text ()) fmt . Println ( result . ExecutableCode ()) fmt . Println ( result . CodeExecutionResult ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"tools": \ No newline at end of file diff --git a/docstore/e6b36b47-013a-40d4-871b-80c17aafc957 b/docstore/e6b36b47-013a-40d4-871b-80c17aafc957 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/e6b36b47-013a-40d4-871b-80c17aafc957 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/e6c9b4f2-4b7d-4d0d-bd4e-6cd1ad70151e b/docstore/e6c9b4f2-4b7d-4d0d-bd4e-6cd1ad70151e new file mode 100644 index 0000000000000000000000000000000000000000..4781e96bc89cf0be67f0a65c094deb317c17f5b0 --- /dev/null +++ b/docstore/e6c9b4f2-4b7d-4d0d-bd4e-6cd1ad70151e @@ -0,0 +1 @@ +result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) } Other use cases and platforms Refer to use case specific guides on Gemini Developer API Documentation and Vertex AI documentation for other platforms and use cases. Migration considerations When you migrate: You'll need to use Google Cloud service accounts to authenticate. See the Vertex AI documentation for more information. You can use your existing Google Cloud project (the same one you used to generate your API key) or you can create a new Google Cloud project . Supported regions may differ between the Gemini Developer API and the Vertex AI Gemini API. See the list of supported regions for generative AI on Google Cloud . Any models you created in Google AI Studio need to be retrained in Vertex AI. If you no longer need to use your Gemini API key for the Gemini Developer API, then follow security best practices and delete it. To delete an API key: Open the Google Cloud API Credentials page. Find the API key you want to delete and click the Actions icon. Select Delete API key . In the Delete credential modal, select Delete . Deleting an API key takes a few minutes to propagate. After propagation completes, any traffic using the deleted API key is rejected. Important: If you have deleted a key that is still used in production and need to recover it, see gcloud beta services api-keys undelete . 
Next steps See the Generative AI on Vertex AI overview to learn more about generative AI solutions on Vertex AI. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-26 UTC. \ No newline at end of file diff --git a/docstore/e6d135b0-0c2a-498c-8c46-da6b078bfc6f b/docstore/e6d135b0-0c2a-498c-8c46-da6b078bfc6f new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/e6d135b0-0c2a-498c-8c46-da6b078bfc6f @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/e6e7066f-24bd-488a-a401-e73099c0db7b b/docstore/e6e7066f-24bd-488a-a401-e73099c0db7b new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/e6e7066f-24bd-488a-a401-e73099c0db7b @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/e6e99ac0-27f4-4ee0-b9ce-48c53a6b7360 b/docstore/e6e99ac0-27f4-4ee0-b9ce-48c53a6b7360 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/e6e99ac0-27f4-4ee0-b9ce-48c53a6b7360 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/e7154e47-2514-4b04-8f06-9926bef01907 b/docstore/e7154e47-2514-4b04-8f06-9926bef01907 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/e7154e47-2514-4b04-8f06-9926bef01907 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/e74db7a1-eb7e-4165-b411-6b197f998a85 b/docstore/e74db7a1-eb7e-4165-b411-6b197f998a85 new file mode 100644 index 0000000000000000000000000000000000000000..7e531e47df350a1c55b98c04d16d5e79af210c1a --- /dev/null +++ b/docstore/e74db7a1-eb7e-4165-b411-6b197f998a85 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro-preview-tts Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e74dbfc6-7ccd-4d35-a926-f469d84db8e9 b/docstore/e74dbfc6-7ccd-4d35-a926-f469d84db8e9 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/e74dbfc6-7ccd-4d35-a926-f469d84db8e9 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/e7531406-1aea-43a1-9eb7-165a158b0b33 b/docstore/e7531406-1aea-43a1-9eb7-165a158b0b33 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/e7531406-1aea-43a1-9eb7-165a158b0b33 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/e7582930-9771-4363-bd93-a3b32c43c2c6 b/docstore/e7582930-9771-4363-bd93-a3b32c43c2c6 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/e7582930-9771-4363-bd93-a3b32c43c2c6 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/e77d8811-9e3b-4626-86b4-b6ef8bd15c68 b/docstore/e77d8811-9e3b-4626-86b4-b6ef8bd15c68 new file mode 100644 index 0000000000000000000000000000000000000000..122c682d2774097387ed4735af082d43f98d76f5 --- /dev/null +++ b/docstore/e77d8811-9e3b-4626-86b4-b6ef8bd15c68 @@ -0,0 +1 @@ +regions. Can I use 1M tokens in the free tier? The free tier for Gemini API differs based on the model selected. For now, you can try the 1M token context window in the following ways: In Google AI Studio With pay-as-you-go plans With free-of-charge plans for select models See the latest free-of-charge rate limits per model on rate limits page . How can I calculate the number of tokens I'm using? Use the GenerativeModel.count_tokens method to count the number of tokens. Refer to the Tokens guide to learn more about tokens. Can I use my Google Cloud credits with the Gemini API? Yes, Google Cloud credits can be used towards Gemini API usage. How is billing handled? Billing for the Gemini API is handled by the Cloud Billing system. Am I charged for failed requests? If your request fails with a 400 or 500 error, you won't be charged for the tokens used. However, the request will still count against your quota. Is there a charge for fine-tuning the models? Model tuning is free, but inference on tuned models is charged at the same rate as the base models. Is GetTokens billed? Requests to the GetTokens API are not billed, and they don't count against inference quota. How is my Google AI Studio data handled if I have a paid API account? Refer to the terms for details on how data is handled when Cloud billing is enabled (see "How Google Uses Your Data" under "Paid Services"). Note that your Google AI Studio prompts are treated under the same "Paid Services" terms so long as at least 1 API project has billing enabled, which you can validate on the Gemini API Key page if you see any projects marked as "Paid" under "Plan". Where can I get help with billing? To get help with billing, see Get Cloud Billing support . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . 
Java is a registered \ No newline at end of file diff --git a/docstore/e789f77a-2d3c-4258-9d3f-16811efb56a0 b/docstore/e789f77a-2d3c-4258-9d3f-16811efb56a0 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/e789f77a-2d3c-4258-9d3f-16811efb56a0 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/e79daf22-4e8e-4411-a530-2b12630ea9c4 b/docstore/e79daf22-4e8e-4411-a530-2b12630ea9c4 new file mode 100644 index 0000000000000000000000000000000000000000..6cd03da77a7c04bb143fe9601905375d481c4c1f --- /dev/null +++ b/docstore/e79daf22-4e8e-4411-a530-2b12630ea9c4 @@ -0,0 +1 @@ +Gemini 2.0 Flash Preview Image Generation 2,000 3,000,000 100,000 Gemini 2.0 Flash-Lite 20,000 10,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 3 Model RPM TPM RPD Gemini 2.5 Pro 2,000 8,000,000 -- Gemini 2.5 Flash 10,000 8,000,000 -- Gemini 2.5 Flash-Lite Preview 06-17 30,000 30,000,000 -- Gemini 2.5 Flash Preview TTS 1,000 1,000,000 -- Gemini 2.5 Pro Preview TTS 100 1,000,000 -- Gemini 2.0 Flash 30,000 30,000,000 -- Gemini 2.0 Flash Preview Image Generation 5,000 5,000,000 -- Gemini 2.0 Flash-Lite 30,000 30,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Specified rate limits are not guaranteed and actual capacity may vary. Live API rate limits The Live API processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. This API has a different set of rate limits than the standard Gemini API calls. Free Tier Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 3 1,000,000 -- Gemini 2.0 Flash Live 3 1,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 1 25,000 5 Gemini 2.5 Flash Experimental Native Audio Thinking Dialog 1 10,000 5 Tier 1 Name Concurrent sessions TPM RPD Gemini 2.5 Flash Live 50 4,000,000 -- Gemini 2.0 Flash Live 50 4,000,000 -- Gemini 2.5 Flash Preview Native Audio Dialog 3 50,000 50 Gemini 2.5 Flash Experimental Native Audio Thinking \ No newline at end of file diff --git a/docstore/e7a9bb93-981f-4c45-8a25-cfe0aa8e0aea b/docstore/e7a9bb93-981f-4c45-8a25-cfe0aa8e0aea new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/e7a9bb93-981f-4c45-8a25-cfe0aa8e0aea @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/docstore/e7c77877-008d-4a7f-b368-76b64ec7ebd3 b/docstore/e7c77877-008d-4a7f-b368-76b64ec7ebd3 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/e7c77877-008d-4a7f-b368-76b64ec7ebd3 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/e7e12882-bf8d-4cb3-a06f-0b212e2dd69b b/docstore/e7e12882-bf8d-4cb3-a06f-0b212e2dd69b new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/e7e12882-bf8d-4cb3-a06f-0b212e2dd69b @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. 
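For context on how masks like the ones saved above are obtained in the first place, the following is a minimal Python sketch of requesting segmentation output from a Gemini 2.5 model with the google-genai client. The prompt wording, the box_2d field, and the fence-stripping step are assumptions based on the surrounding description rather than a verbatim excerpt of the cookbook example; label and mask follow the fields used in the post-processing code above.

# Minimal sketch (see assumptions in the lead-in): request segmentation masks
# from a Gemini 2.5 model and parse the JSON list it returns.
import json
from google import genai
from google.genai import types

client = genai.Client()  # reads GEMINI_API_KEY from the environment

with open("path/to/image.png", "rb") as f:
    image_bytes = f.read()

prompt = (
    "Give segmentation masks for the prominent items. "
    "Output a JSON list where each entry has 'box_2d', 'mask', and 'label'."
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=image_bytes, mime_type="image/png"),
        prompt,
    ],
)

# The model may wrap the JSON in a markdown fence; strip it if present.
text = response.text.strip()
if text.startswith("```"):
    text = text.split("\n", 1)[1].rsplit("```", 1)[0]

items = json.loads(text)
for item in items:
    print(item["label"], item["box_2d"])  # 'mask' holds a base64-encoded PNG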
An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/e7e70030-cb85-4391-a36d-91c40994423e b/docstore/e7e70030-cb85-4391-a36d-91c40994423e new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/e7e70030-cb85-4391-a36d-91c40994423e @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/e808aa60-1417-4842-bdef-0ccc29e30f46 b/docstore/e808aa60-1417-4842-bdef-0ccc29e30f46 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e808aa60-1417-4842-bdef-0ccc29e30f46 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e8099d52-c4c3-4769-a58f-d2717beb4443 b/docstore/e8099d52-c4c3-4769-a58f-d2717beb4443 new file mode 100644 index 0000000000000000000000000000000000000000..7645b864913317d4ec923e00d51796055880e22d --- /dev/null +++ b/docstore/e8099d52-c4c3-4769-a58f-d2717beb4443 @@ -0,0 +1 @@ +https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f \ No newline at end of file diff --git a/docstore/e80ffbca-c1e9-4250-b8f2-53bb7b766cd7 b/docstore/e80ffbca-c1e9-4250-b8f2-53bb7b766cd7 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/e80ffbca-c1e9-4250-b8f2-53bb7b766cd7 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
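The polling snippet above is cut off at the failure branch. As a rough companion, the sketch below shows one way to act on each terminal state once polling finishes; the dest.file_name attribute, the files.download call, and the error field are assumptions about the SDK surface, so verify them against the Batch Mode reference before relying on them.

# Rough sketch (attribute and method names are assumptions, see lead-in).
# Reuses batch_job and client from the polling loop above.
if batch_job.state.name == "JOB_STATE_SUCCEEDED":
    # Jobs created from an input file write their results to an output file.
    result_file_name = batch_job.dest.file_name                 # assumption
    file_bytes = client.files.download(file=result_file_name)   # assumption
    for line in file_bytes.decode("utf-8").splitlines():
        print(line)  # one JSON-encoded response per request
elif batch_job.state.name == "JOB_STATE_FAILED":
    print(f"Job failed: {batch_job.error}")                     # assumption
elif batch_job.state.name == "JOB_STATE_CANCELLED":
    print("Job was cancelled by the user.")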
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/e8148b84-7288-414e-a13a-e18a40e5065f b/docstore/e8148b84-7288-414e-a13a-e18a40e5065f new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/e8148b84-7288-414e-a13a-e18a40e5065f @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. 
toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/e84903cc-4efd-4b0b-b7ca-a97760ba5922 b/docstore/e84903cc-4efd-4b0b-b7ca-a97760ba5922 new file mode 100644 index 0000000000000000000000000000000000000000..8a34a1fe66a041005f53a5e081e09b0fa5f13242 --- /dev/null +++ b/docstore/e84903cc-4efd-4b0b-b7ca-a97760ba5922 @@ -0,0 +1 @@ +Grounding with Google Search | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Grounding with Google Search Grounding with Google Search connects the Gemini model to real-time web content and works with all available languages . This allows Gemini to provide more accurate answers and cite verifiable sources beyond its knowledge cutoff. Grounding helps you build applications that can: Increase factual accuracy: Reduce model hallucinations by basing responses on real-world information. Access real-time information: Answer questions about recent events and topics. Provide citations: Build user trust by showing the sources for the model's claims. Python from google import genai from google.genai import types # Configure the client client = genai . Client () # Define the grounding tool grounding_tool = types . Tool ( google_search = types . GoogleSearch () ) # Configure generation settings config = types . GenerateContentConfig ( tools = [ grounding_tool ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Who won the euro 2024?" , config = config , ) # Print the grounded response print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Define the grounding tool const groundingTool = { googleSearch : {}, }; // Configure generation settings const config = { tools : [ groundingTool ], }; // Make the request const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Who won the euro 2024?" , config , }); // Print the grounded response console . log ( response . 
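The JavaScript example above is truncated at its final log statement. Because this section highlights citations as a key benefit of grounding, here is a small hedged sketch of reading grounding metadata from the Python response obtained above; the grounding_metadata, web_search_queries, and grounding_chunks field names are assumptions about the response shape, so confirm them against the grounding reference.

# Hedged sketch: inspect grounding metadata on the grounded response above.
# Field names are assumptions (see lead-in).
candidate = response.candidates[0]
metadata = candidate.grounding_metadata

if metadata:
    print("Search queries used:", metadata.web_search_queries)
    for chunk in metadata.grounding_chunks or []:
        # Each chunk points at a web source the answer was grounded on.
        print(chunk.web.title, chunk.web.uri)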
\ No newline at end of file diff --git a/docstore/e84c0a2f-0e3c-4b02-9fbf-48eb63522fa2 b/docstore/e84c0a2f-0e3c-4b02-9fbf-48eb63522fa2 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/e84c0a2f-0e3c-4b02-9fbf-48eb63522fa2 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. It's your application's responsibility to \ No newline at end of file diff --git a/docstore/e8741c3e-c116-4da8-965b-87e3b84533ca b/docstore/e8741c3e-c116-4da8-965b-87e3b84533ca new file mode 100644 index 0000000000000000000000000000000000000000..d1b83172ef094b37793cd8d72611bb079685a2af --- /dev/null +++ b/docstore/e8741c3e-c116-4da8-965b-87e3b84533ca @@ -0,0 +1 @@ +with Google Search Not available Not available Used to improve our products Yes No Imagen 4 Preview Try it in Google AI Studio Our latest image generation model, with significantly better text rendering and better overall image quality. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per Image in USD Imagen 4 Standard image price Not available $0.04 Imagen 4 Ultra image price Not available $0.06 Used to improve our products Yes No Imagen 3 Try it in Google AI Studio Our state-of-the-art image generation model, available to developers on the paid tier of the Gemini API. Free Tier Paid Tier, per Image in USD Image price Not available $0.03 Used to improve our products Yes No Veo 2 Try the API Our state-of-the-art video generation model, available to developers on the paid tier of the Gemini API. 
Free Tier Paid Tier, per second in USD Video price Not available $0.35 Used to improve our products Yes No Gemma 3 Try Gemma 3 Our lightweight, state-of the art, open model built from the same technology that powers our Gemini models. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemma 3n Try Gemma 3n Our open model built for efficient performance on everyday devices like mobile phones, laptops, and tablets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemini 1.5 Flash Try it in Google AI \ No newline at end of file diff --git a/docstore/e87d7894-6ab9-4428-b119-a3e4db71847a b/docstore/e87d7894-6ab9-4428-b119-a3e4db71847a new file mode 100644 index 0000000000000000000000000000000000000000..9215b7f3aa3740b8a201f39a22b1e9c0be4ec501 --- /dev/null +++ b/docstore/e87d7894-6ab9-4428-b119-a3e4db71847a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/available-regions Title: Available regions for Google AI Studio and Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e881d626-156d-459c-8e7c-b9b847f4cf80 b/docstore/e881d626-156d-459c-8e7c-b9b847f4cf80 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/e881d626-156d-459c-8e7c-b9b847f4cf80 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/e8861b9d-54b1-4174-af03-3db27c7e1507 b/docstore/e8861b9d-54b1-4174-af03-3db27c7e1507 new file mode 100644 index 0000000000000000000000000000000000000000..ca0d44e9961a5c85cb8aeb7b443a141c9784fb0d --- /dev/null +++ b/docstore/e8861b9d-54b1-4174-af03-3db27c7e1507 @@ -0,0 +1 @@ +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs \ No newline at end of file diff --git a/docstore/e88d0c09-55be-4fa8-95ea-31d676c4cc66 b/docstore/e88d0c09-55be-4fa8-95ea-31d676c4cc66 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/e88d0c09-55be-4fa8-95ea-31d676c4cc66 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/e8b00846-6336-4511-824d-971fa7fe0f24 b/docstore/e8b00846-6336-4511-824d-971fa7fe0f24 new file mode 100644 index 0000000000000000000000000000000000000000..f2eb75e0bb8ad435829f4661abb79206447b0b03 --- /dev/null +++ b/docstore/e8b00846-6336-4511-824d-971fa7fe0f24 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/e8c5748c-7b78-4e1e-a25e-d3baec1dc2e1 b/docstore/e8c5748c-7b78-4e1e-a25e-d3baec1dc2e1 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/e8c5748c-7b78-4e1e-a25e-d3baec1dc2e1 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. 
In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/e8ccdeda-30d4-4ce0-ba68-ccee16ebc8bf b/docstore/e8ccdeda-30d4-4ce0-ba68-ccee16ebc8bf new file mode 100644 index 0000000000000000000000000000000000000000..6acfe7ba02c229c83724a8d4179d85fae6727e02 --- /dev/null +++ b/docstore/e8ccdeda-30d4-4ce0-ba68-ccee16ebc8bf @@ -0,0 +1 @@ +images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. 
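To make the naming patterns above concrete, the sketch below issues the same request against a latest-stable alias, a pinned stable version, and a preview version. The model strings are the ones quoted in this section; whether a given preview or experimental version is still being served at any later date is not guaranteed.

# Sketch: the version pattern only changes the model string you pass in.
from google import genai

client = genai.Client()

model_names = [
    "gemini-2.0-flash",              # latest stable for 2.0 Flash
    "gemini-2.0-flash-001",          # pinned stable version
    "gemini-2.5-pro-preview-06-05",  # preview version
]

for name in model_names:
    response = client.models.generate_content(
        model=name,
        contents="Say hello in one short sentence.",
    )
    print(name, "->", response.text)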
The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The \ No newline at end of file diff --git a/docstore/e8d54042-8481-4c6c-a85f-346a86f0b828 b/docstore/e8d54042-8481-4c6c-a85f-346a86f0b828 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/e8d54042-8481-4c6c-a85f-346a86f0b828 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call .
name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/e8e1f659-73fc-4afd-a97c-dd904cba2613 b/docstore/e8e1f659-73fc-4afd-a97c-dd904cba2613 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/e8e1f659-73fc-4afd-a97c-dd904cba2613 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/e8e5d901-174d-4589-993b-6694474c2922 b/docstore/e8e5d901-174d-4589-993b-6694474c2922 new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/e8e5d901-174d-4589-993b-6694474c2922 @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. 
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . \ No newline at end of file diff --git a/docstore/e92b093a-d4d2-47f2-8536-0f6162c3032b b/docstore/e92b093a-d4d2-47f2-8536-0f6162c3032b new file mode 100644 index 0000000000000000000000000000000000000000..03e11f1b3b2cd84e15c6098d543ad30ece4e0a72 --- /dev/null +++ b/docstore/e92b093a-d4d2-47f2-8536-0f6162c3032b @@ -0,0 +1 @@ +Friday." }, ], response_format = CalendarEvent , ) print ( completion . choices [ 0 ] . message . parsed ) JavaScript import OpenAI from "openai" ; import { zodResponseFormat } from "openai/helpers/zod" ; import { z } from "zod" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai" }); const CalendarEvent = z . object ({ name : z . string (), date : z . string (), participants : z . array ( z . string ()), }); const completion = await openai . beta . chat . completions . parse ({ model : "gemini-2.0-flash" , messages : [ { role : "system" , content : "Extract the event information." }, { role : "user" , content : "John and Susan are going to an AI conference on Friday" }, ], response_format : zodResponseFormat ( CalendarEvent , "event" ), }); const event = completion . choices [ 0 ]. message . parsed ; console . log ( event ); Embeddings Text embeddings measure the relatedness of text strings and can be generated using the Gemini API . Python from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) response = client . embeddings . create ( input = "Your text string goes here" , model = "text-embedding-004" ) print ( response . data [ 0 ] . embedding ) JavaScript import OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const embedding = await openai . embeddings . create ({ model : "text-embedding-004" , input : "Your text string goes here" , }); console . 
log ( embedding ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/embeddings" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "input": "Your text string goes here", "model": "text-embedding-004" }' extra_body There are several features supported by Gemini that \ No newline at end of file diff --git a/docstore/e9303435-3b4f-4366-80d5-98beb1506a1e b/docstore/e9303435-3b4f-4366-80d5-98beb1506a1e new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/e9303435-3b4f-4366-80d5-98beb1506a1e @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/e93fa2c0-696c-41ac-87b3-217f453ad322 b/docstore/e93fa2c0-696c-41ac-87b3-217f453ad322 new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/e93fa2c0-696c-41ac-87b3-217f453ad322 @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. 
Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/e9423c14-ba6c-41c9-9da2-1d3964277b58 b/docstore/e9423c14-ba6c-41c9-9da2-1d3964277b58 new file mode 100644 index 0000000000000000000000000000000000000000..2bc9ee1b64943d2fc9ee4b66d281a35e0e278a02 --- /dev/null +++ b/docstore/e9423c14-ba6c-41c9-9da2-1d3964277b58 @@ -0,0 +1 @@ +Session management with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This unique session design enables low latency and supports unique features, but can also introduce challenges, like session time limits, and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions to an unlimited amount of time. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption . You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. 
In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( \ No newline at end of file diff --git a/docstore/e9451881-59ac-488e-bc1c-acdd6d6cd577 b/docstore/e9451881-59ac-488e-bc1c-acdd6d6cd577 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/e9451881-59ac-488e-bc1c-acdd6d6cd577 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/e94cbbc2-4124-46ad-9924-3533a0e1d817 b/docstore/e94cbbc2-4124-46ad-9924-3533a0e1d817 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/e94cbbc2-4124-46ad-9924-3533a0e1d817 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
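For illustration, a minimal sketch of generating and comparing embeddings with the Python SDK might look like the following (this assumes the google-genai client picks up a GEMINI_API_KEY environment variable and that embed_content accepts a list of strings; verify the exact names against your SDK version): Python
import math
from google import genai

# Illustrative sketch only: assumes the google-genai Python SDK and a
# GEMINI_API_KEY environment variable read by genai.Client().
client = genai.Client()

# text-embedding-004 produces 768-dimensional vectors (see the model details below).
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["How do I bake a pie?", "Suggest a recipe for a pie."],
)
vectors = [e.values for e in result.embeddings]

def cosine_similarity(a, b):
    # Relatedness between two strings is commonly scored as the cosine of
    # the angle between their embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

print(cosine_similarity(vectors[0], vectors[1]))
Higher cosine values indicate more closely related strings, which is how these embeddings are typically used for retrieval, clustering, and semantic search.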
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/e9708955-3b90-467b-b236-3831b3077cc4 b/docstore/e9708955-3b90-467b-b236-3831b3077cc4 new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/e9708955-3b90-467b-b236-3831b3077cc4 @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. 
It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/e99b2218-b7f6-4c47-a5ff-e1b9e02710d5 b/docstore/e99b2218-b7f6-4c47-a5ff-e1b9e02710d5 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/e99b2218-b7f6-4c47-a5ff-e1b9e02710d5 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/e9aba6ad-b886-424d-8e2e-a92c14e9d2d6 b/docstore/e9aba6ad-b886-424d-8e2e-a92c14e9d2d6 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/e9aba6ad-b886-424d-8e2e-a92c14e9d2d6 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/e9d11f89-b307-456c-a18e-52eb79650c01 b/docstore/e9d11f89-b307-456c-a18e-52eb79650c01 new file mode 100644 index 0000000000000000000000000000000000000000..4d5e24b23445eada240041ce046d4864e5df3992 --- /dev/null +++ b/docstore/e9d11f89-b307-456c-a18e-52eb79650c01 @@ -0,0 +1 @@ +anything with a lower probability is allowed. Threshold (Google AI Studio) Threshold (API) Description Block none BLOCK_NONE Always show regardless of probability of unsafe content Block few BLOCK_ONLY_HIGH Block when high probability of unsafe content Block some BLOCK_MEDIUM_AND_ABOVE Block when medium or high probability of unsafe content Block most BLOCK_LOW_AND_ABOVE Block when low, medium or high probability of unsafe content N/A HARM_BLOCK_THRESHOLD_UNSPECIFIED Threshold is unspecified, block using default threshold If the threshold is not set, the default block threshold is Block none (for gemini-1.5-pro-002 and gemini-1.5-flash-002 and all newer stable GA models) or Block some (in all other models) for all categories except the Civic integrity category. The default block threshold for the Civic integrity category is Block none (for gemini-2.0-flash-001 aliased as gemini-2.0-flash , gemini-2.0-pro-exp-02-05 , and gemini-2.0-flash-lite ) both for Google AI Studio and the Gemini API, and Block most for all other models in Google AI Studio only. You can set these settings for each request you make to the generative service. See the HarmBlockThreshold API reference for details. Safety feedback generateContent returns a GenerateContentResponse which includes safety feedback. Prompt feedback is included in promptFeedback . If promptFeedback.blockReason is set, then the content of the prompt was blocked. Response candidate feedback is included in Candidate.finishReason and Candidate.safetyRatings . 
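For example, a minimal sketch of reading this feedback with the Python SDK could look like the following (assuming the google-genai client; the snake_case attribute names mirror the promptFeedback and Candidate fields described above but may differ by SDK version): Python
from google import genai

# Illustrative sketch only: assumes the google-genai Python SDK and a
# GEMINI_API_KEY environment variable.
client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a short, friendly greeting.",
)

# Prompt feedback: a populated block_reason means the prompt itself was blocked.
if response.prompt_feedback and response.prompt_feedback.block_reason:
    print("Prompt blocked:", response.prompt_feedback.block_reason)

# Candidate feedback: finish_reason and per-category safety ratings.
for candidate in response.candidates or []:
    print("finish_reason:", candidate.finish_reason)
    for rating in candidate.safety_ratings or []:
        print(rating.category, rating.probability)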
If response content was blocked and the finishReason was SAFETY , you can inspect safetyRatings for more details. The content that was blocked is not returned. Adjust safety settings This section covers how to adjust the safety settings in both Google AI Studio and in your code. Google AI Studio You can adjust safety settings in Google AI Studio, but you cannot turn them off. Click Edit safety settings in the Run settings panel to open the Run safety settings modal. In \ No newline at end of file diff --git a/docstore/e9d3b443-9ada-43c3-a1df-cd96dfed7401 b/docstore/e9d3b443-9ada-43c3-a1df-cd96dfed7401 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/e9d3b443-9ada-43c3-a1df-cd96dfed7401 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details; it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/e9e38699-f02d-4a84-9adc-bf4e9038ace3 b/docstore/e9e38699-f02d-4a84-9adc-bf4e9038ace3 new file mode 100644 index 0000000000000000000000000000000000000000..9bdceff10febb2c20e655c9c51c8df9e3aef858d --- /dev/null +++ b/docstore/e9e38699-f02d-4a84-9adc-bf4e9038ace3 @@ -0,0 +1 @@ +Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe?
Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want the model to just respond with one of the options instead of in a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to \ No newline at end of file diff --git a/docstore/e9ee0271-93e4-4241-9841-5015f31fc4d2 b/docstore/e9ee0271-93e4-4241-9841-5015f31fc4d2 new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/e9ee0271-93e4-4241-9841-5015f31fc4d2 @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2.
A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/e9f664b3-0089-4c2e-af1c-21d7d2e0243c b/docstore/e9f664b3-0089-4c2e-af1c-21d7d2e0243c new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/e9f664b3-0089-4c2e-af1c-21d7d2e0243c @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/e9fae3d2-7e03-4679-aa44-ed3b8def30b5 b/docstore/e9fae3d2-7e03-4679-aa44-ed3b8def30b5 new file mode 100644 index 0000000000000000000000000000000000000000..b410c3d6fc95b8ad9abaf080d511bad548d2b4e0 --- /dev/null +++ b/docstore/e9fae3d2-7e03-4679-aa44-ed3b8def30b5 @@ -0,0 +1 @@ +get all 50." ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , tools : [{ codeExecution : {} }], }); const result = await model . generateContent ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get " + "all 50." , ); console . 
log ( result . response . text ()); After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the sum of the first 50 prime numbers? Generate and run ' 'code for the calculation, and make sure you get all 50.' , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )], ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-pro-exp-02-05" , contents : `Write and execute code that calculates the sum of the first 50 prime numbers. Ensure that only the executable code and its resulting output are generated.` , }); // Each part may contain text, executable code, or an execution result. for ( const part of response . candidates [ 0 ]. content . parts ) { console . log ( part ); console . log ( "\n" ); } console . log ( "-" . repeat ( 80 )); // The `.text` accessor concatenates the parts into a markdown-formatted text. console . log ( "\n" , response . text ); Search grounding GoogleSearch (Gemini>=2.0) and GoogleSearchRetrieval (Gemini < 2.0) are tools that allow the model to retrieve public web data for grounding, powered by Google. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( contents = "what is the \ No newline at end of file diff --git a/docstore/e9fd8d13-d8d8-4123-ba4d-af1637fdd000 b/docstore/e9fd8d13-d8d8-4123-ba4d-af1637fdd000 new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/e9fd8d13-d8d8-4123-ba4d-af1637fdd000 @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. 
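Pulling these parameters together, a minimal text-to-video request through the Python SDK might look like the following sketch (assuming the google-genai client and that the camelCase REST fields above map to snake_case GenerateVideosConfig fields such as aspect_ratio and duration_seconds; verify the exact names against your SDK version): Python
import time
from google import genai
from google.genai import types

# Illustrative sketch only: assumes the google-genai Python SDK and a
# GEMINI_API_KEY environment variable.
client = genai.Client()

# Video generation is a long-running operation: the request returns an
# operation handle that is polled until the video is ready.
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows.",
    config=types.GenerateVideosConfig(
        aspect_ratio="16:9",              # or "9:16" for portrait
        person_generation="allow_adult",  # see the limitations above
        number_of_videos=1,
        duration_seconds=8,               # 5-8 seconds per the spec above
    ),
)

while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

# Download and save each generated video.
for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"veo_output_{n}.mp4")
Because generation can take from several seconds to a few minutes during peak hours, the polling interval is a deliberate design choice rather than part of the API contract.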
Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/ea074a88-203f-42ae-9b6a-79105217050c b/docstore/ea074a88-203f-42ae-9b6a-79105217050c new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/ea074a88-203f-42ae-9b6a-79105217050c @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/ea08af02-fb00-4dd4-abef-64d3d6e34b7b b/docstore/ea08af02-fb00-4dd4-abef-64d3d6e34b7b new file mode 100644 index 0000000000000000000000000000000000000000..e83e80a217e9ff779ee42e75d11ac96b01a2f185 --- /dev/null +++ b/docstore/ea08af02-fb00-4dd4-abef-64d3d6e34b7b @@ -0,0 +1 @@ +function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . 
ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to declarations, handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Call the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config \ No newline at end of file diff --git a/docstore/ea0a0771-fa7d-42f6-9185-cd7243addf66 b/docstore/ea0a0771-fa7d-42f6-9185-cd7243addf66 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/ea0a0771-fa7d-42f6-9185-cd7243addf66 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/ea1f6ee6-ff05-4479-8fc8-93df21ed3df7 b/docstore/ea1f6ee6-ff05-4479-8fc8-93df21ed3df7 new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/ea1f6ee6-ff05-4479-8fc8-93df21ed3df7 @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . 
Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/docstore/ea205832-a564-431b-ab05-ae147fdfb320 b/docstore/ea205832-a564-431b-ab05-ae147fdfb320 new file mode 100644 index 0000000000000000000000000000000000000000..2e8ac78d46cd543ec267658a40dc30e0a2feb077 --- /dev/null +++ b/docstore/ea205832-a564-431b-ab05-ae147fdfb320 @@ -0,0 +1 @@ +Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/ea2567a0-2ad4-43ec-bab3-6b8ac6803517 b/docstore/ea2567a0-2ad4-43ec-bab3-6b8ac6803517 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/ea2567a0-2ad4-43ec-bab3-6b8ac6803517 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/ea2621bb-ef03-45db-8207-23752897ca26 b/docstore/ea2621bb-ef03-45db-8207-23752897ca26 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/ea2621bb-ef03-45db-8207-23752897ca26 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/ea2d3ef5-8c2f-4bad-be4f-929fa8352ac4 b/docstore/ea2d3ef5-8c2f-4bad-be4f-929fa8352ac4 new file mode 100644 index 0000000000000000000000000000000000000000..203c1e44cd8be8f0144d13a11f43fb822930ab15 --- /dev/null +++ b/docstore/ea2d3ef5-8c2f-4bad-be4f-929fa8352ac4 @@ -0,0 +1 @@ +] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is a experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources nor prompts It is available for the Python and JavaScript/TypeScript SDK. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limit what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid \ No newline at end of file diff --git a/docstore/ea4a16ed-988f-4b18-a43c-a8f180c552ed b/docstore/ea4a16ed-988f-4b18-a43c-a8f180c552ed new file mode 100644 index 0000000000000000000000000000000000000000..a99dcaa8b13f3b83504ad1b35c796f8fc297615c --- /dev/null +++ b/docstore/ea4a16ed-988f-4b18-a43c-a8f180c552ed @@ -0,0 +1 @@ +] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. 
Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for the MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDK can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from \ No newline at end of file diff --git a/docstore/ea5ded06-7449-4de9-880c-e7bdcf82ea8e b/docstore/ea5ded06-7449-4de9-880c-e7bdcf82ea8e new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/ea5ded06-7449-4de9-880c-e7bdcf82ea8e @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. 
It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/ea6f4a09-cce5-46af-876f-a9027c1a4d2f b/docstore/ea6f4a09-cce5-46af-876f-a9027c1a4d2f new file mode 100644 index 0000000000000000000000000000000000000000..c3dd210d4957ccdbb55df147cb99efb49a9932b2 --- /dev/null +++ b/docstore/ea6f4a09-cce5-46af-876f-a9027c1a4d2f @@ -0,0 +1 @@ +tokens Context caching price Not available $0.3125, prompts <= 128k tokens $0.625, prompts > 128k tokens Context caching (storage) Not available $4.50 per hour Tuning price Not available Not available Grounding with Google Search Not available $35 / 1K grounding requests Used to improve our products Yes No Text Embedding 004 Our state-of-the-art text embedding model. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Tuning price Not available Not available Used to improve our products Yes No [*] Google AI Studio usage is free of charge in all available regions . See Billing FAQs for details. [**] Prices may differ from the prices listed here and the prices offered on Vertex AI. For Vertex prices, see the Vertex AI pricing page . [***] If you are using dynamic retrieval to optimize costs, only requests that contain at least one grounding support URL from the web in their response are charged for Grounding with Google Search. Costs for Gemini always apply. Rate limits are subject to change. Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ea6facbb-32da-4f2c-a16f-cf6f1669bcc1 b/docstore/ea6facbb-32da-4f2c-a16f-cf6f1669bcc1 new file mode 100644 index 0000000000000000000000000000000000000000..d1b83172ef094b37793cd8d72611bb079685a2af --- /dev/null +++ b/docstore/ea6facbb-32da-4f2c-a16f-cf6f1669bcc1 @@ -0,0 +1 @@ +with Google Search Not available Not available Used to improve our products Yes No Imagen 4 Preview Try it in Google AI Studio Our latest image generation model, with significantly better text rendering and better overall image quality. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per Image in USD Imagen 4 Standard image price Not available $0.04 Imagen 4 Ultra image price Not available $0.06 Used to improve our products Yes No Imagen 3 Try it in Google AI Studio Our state-of-the-art image generation model, available to developers on the paid tier of the Gemini API. Free Tier Paid Tier, per Image in USD Image price Not available $0.03 Used to improve our products Yes No Veo 2 Try the API Our state-of-the-art video generation model, available to developers on the paid tier of the Gemini API. 
Free Tier Paid Tier, per second in USD Video price Not available $0.35 Used to improve our products Yes No Gemma 3 Try Gemma 3 Our lightweight, state-of the art, open model built from the same technology that powers our Gemini models. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemma 3n Try Gemma 3n Our open model built for efficient performance on everyday devices like mobile phones, laptops, and tablets. Free Tier Paid Tier, per 1M tokens in USD Input price Free of charge Not available Output price Free of charge Not available Context caching price Free of charge Not available Context caching (storage) Free of charge Not available Tuning price Not available Not available Grounding with Google Search Not available Not available Used to improve our products Yes No Gemini 1.5 Flash Try it in Google AI \ No newline at end of file diff --git a/docstore/ea9a591e-ffc5-4b37-8d22-9218f71120f6 b/docstore/ea9a591e-ffc5-4b37-8d22-9218f71120f6 new file mode 100644 index 0000000000000000000000000000000000000000..6e142f65f27405c6ffa9b3195699f63ab58a2f08 --- /dev/null +++ b/docstore/ea9a591e-ffc5-4b37-8d22-9218f71120f6 @@ -0,0 +1 @@ +happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. 
You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) \ No newline at end of file diff --git a/docstore/ea9b9e43-a9e8-43a4-8ca3-7c9c1ed02f4e b/docstore/ea9b9e43-a9e8-43a4-8ca3-7c9c1ed02f4e new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/ea9b9e43-a9e8-43a4-8ca3-7c9c1ed02f4e @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . 
text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/eaab9ea5-1e60-443f-b3cb-9ee19305a8a7 b/docstore/eaab9ea5-1e60-443f-b3cb-9ee19305a8a7 new file mode 100644 index 0000000000000000000000000000000000000000..c95ce8529f78ed9807c80ac97da2c9c530df9edf --- /dev/null +++ b/docstore/eaab9ea5-1e60-443f-b3cb-9ee19305a8a7 @@ -0,0 +1 @@ +GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , genai . Text ( "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ), config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST curl -s -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts": [ {"text": "Hi, can you create a 3d rendered image of a pig with wings and a top hat flying over a happy futuristic scifi city with lots of greenery?"} ] }], "generationConfig":{"responseModalities":["TEXT","IMAGE"]} }' \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-native-image.png AI-generated image of a fantastical flying pig Image editing (text-and-image-to-image) To perform image editing, add an image as input. The following example demonstrates uploading base64 encoded images. For multiple images and larger payloads, check the image input section. Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import PIL.Image image = PIL . Image . open ( '/path/to/image.png' ) client = genai . Client () text_input = ( 'Hi, This is a picture of me.' 'Can you add a llama next to me?' ,) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = [ text_input , image ], config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' \ No newline at end of file diff --git a/docstore/eab1714c-7e86-4985-bc71-18115964e8a9 b/docstore/eab1714c-7e86-4985-bc71-18115964e8a9 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/eab1714c-7e86-4985-bc71-18115964e8a9 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
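The image-editing snippet above is cut off; the following is a minimal, self-contained Python sketch of the same text-and-image-to-image call, assuming a placeholder input image path and saving any returned inline image to disk.

```python
from io import BytesIO

from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()

# "/path/to/image.png" is a placeholder for your own input image.
source_image = Image.open("/path/to/image.png")

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents=["Hi, this is a picture of me. Can you add a llama next to me?", source_image],
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# The response interleaves text parts with inline image data; save any returned image.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        Image.open(BytesIO(part.inline_data.data)).save("gemini-edited-image.png")
```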
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/eae83529-2020-4a0d-a898-f682ea60d5a2 b/docstore/eae83529-2020-4a0d-a898-f682ea60d5a2 new file mode 100644 index 0000000000000000000000000000000000000000..69f7399c35aaaad68e1bd1a996c44353577b3a79 --- /dev/null +++ b/docstore/eae83529-2020-4a0d-a898-f682ea60d5a2 @@ -0,0 +1 @@ +the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. 
} }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await \ No newline at end of file diff --git a/docstore/eb1aa431-eb43-444b-862b-f5c6e45387a9 b/docstore/eb1aa431-eb43-444b-862b-f5c6e45387a9 new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/eb1aa431-eb43-444b-862b-f5c6e45387a9 @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in a spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model \ No newline at end of file
models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file \ No newline at end of file diff --git a/docstore/eb9bfbc0-9042-49b5-87c6-b57b6ac2de1c b/docstore/eb9bfbc0-9042-49b5-87c6-b57b6ac2de1c new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/eb9bfbc0-9042-49b5-87c6-b57b6ac2de1c @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
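As a quick illustration of the elastic output dimensions mentioned in this model card, here is a minimal Python sketch using the google-genai SDK; the input text is arbitrary and 768 is just one of the supported sizes.

```python
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
    # Elastic output size: 3072, 1536, or 768 dimensions.
    config=types.EmbedContentConfig(output_dimensionality=768),
)
print(len(result.embeddings[0].values))  # 768
```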
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/ebbe01d5-6d8c-4605-92e9-ef3883c572b8 b/docstore/ebbe01d5-6d8c-4605-92e9-ef3883c572b8 new file mode 100644 index 0000000000000000000000000000000000000000..64b38d40afbaa776eeced04508049a0f469e337d --- /dev/null +++ b/docstore/ebbe01d5-6d8c-4605-92e9-ef3883c572b8 @@ -0,0 +1 @@ +OpenAI from "openai" ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function main () { const messages = [{ "role" : "user" , "content" : "What's the weather like in Chicago today?" }]; const tools = [ { "type" : "function" , "function" : { "name" : "get_weather" , "description" : "Get the weather in a given location" , "parameters" : { "type" : "object" , "properties" : { "location" : { "type" : "string" , "description" : "The city and state, e.g. Chicago, IL" , }, "unit" : { "type" : "string" , "enum" : [ "celsius" , "fahrenheit" ]}, }, "required" : [ "location" ], }, } } ]; const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , tools : tools , tool_choice : "auto" , }); console . log ( response ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d '{ "model": "gemini-2.0-flash", "messages": [ { "role": "user", "content": "What' \' 's the weather like in Chicago today?" } ], "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. Chicago, IL" }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"] } }, "required": ["location"] } } } ], "tool_choice": "auto" }' Image understanding Gemini models are natively multimodal and provide best in class performance on many common vision tasks . Python import base64 from openai import OpenAI client = OpenAI ( api_key = "GEMINI_API_KEY" , base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" ) # Function to encode the image def encode_image ( image_path ): with open ( image_path , "rb" ) as image_file : return base64 . 
b64encode ( \ No newline at end of file diff --git a/docstore/ebeec426-5f5f-4ebf-87ae-66ad263ebe65 b/docstore/ebeec426-5f5f-4ebf-87ae-66ad263ebe65 new file mode 100644 index 0000000000000000000000000000000000000000..870bdf233e4121d87fb93e9ca945d2d996edf2dd --- /dev/null +++ b/docstore/ebeec426-5f5f-4ebf-87ae-66ad263ebe65 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-intro#completion Title: Prompt design strategies | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ec1e6938-b9ec-4bc9-9fba-b53a03b54c90 b/docstore/ec1e6938-b9ec-4bc9-9fba-b53a03b54c90 new file mode 100644 index 0000000000000000000000000000000000000000..f1254740842b565388da4397f822e545092a5b88 --- /dev/null +++ b/docstore/ec1e6938-b9ec-4bc9-9fba-b53a03b54c90 @@ -0,0 +1 @@ +NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt = sse \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models \ No newline at end of file diff --git a/docstore/ec3fda54-2623-4584-a764-dc28027cc6cb b/docstore/ec3fda54-2623-4584-a764-dc28027cc6cb new file mode 100644 index 0000000000000000000000000000000000000000..805d10be613a3c55006267c8f5d9ce17fbca450d --- /dev/null +++ b/docstore/ec3fda54-2623-4584-a764-dc28027cc6cb @@ -0,0 +1 @@ +declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. 
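To make the name, description, and parameters fields concrete, here is a minimal Python sketch of one declaration built with the google-genai types, including an enum for a fixed value set and a required list. The set_light_color function, its rooms, and its presets are hypothetical and exist only to illustrate the structure.

```python
from google import genai
from google.genai import types

# Hypothetical declaration illustrating a descriptive name, a specific description,
# strongly typed parameters, an enum for a fixed value set, and required fields.
set_light_color = types.FunctionDeclaration(
    name="set_light_color",
    description="Sets the color temperature of the smart light in a named room.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={
            "room": types.Schema(
                type=types.Type.STRING,
                description="The room containing the light, e.g. 'kitchen'.",
            ),
            "color_temperature": types.Schema(
                type=types.Type.STRING,
                enum=["daylight", "cool", "warm"],
                description="The color temperature preset to apply.",
            ),
        },
        required=["room", "color_temperature"],
    ),
)

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Make the kitchen light warm.",
    config=types.GenerateContentConfig(
        tools=[types.Tool(function_declarations=[set_light_color])]
    ),
)
print(response.function_calls)
```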
This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls, since they require multi-turn requests. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API \ No newline at end of file diff --git a/docstore/ec46f923-030d-41e4-965b-509834a6855f b/docstore/ec46f923-030d-41e4-965b-509834a6855f new file mode 100644 index 0000000000000000000000000000000000000000..5ec9ece3755d9588328841497099189b5d8c6b8d --- /dev/null +++ b/docstore/ec46f923-030d-41e4-965b-509834a6855f @@ -0,0 +1 @@ +5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced auto-completion tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging and it leaves a lot to the model's interpretation. For example, a restaurant's menu might contain many items.
To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how \ No newline at end of file diff --git a/docstore/ec5b92e0-86b4-4162-a371-b568d5a51fb9 b/docstore/ec5b92e0-86b4-4162-a371-b568d5a51fb9 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/ec5b92e0-86b4-4162-a371-b568d5a51fb9 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. 
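Programmatically, the prefix-completion prompt shown above is sent as plain text; here is a minimal Python sketch of that call, with the prompt reproducing the example fields and ending in the "Output:" prefix for the model to complete.

```python
from google import genai

client = genai.Client()

# Few-shot prompt that ends with a response prefix ("Output:") for the model to complete.
prompt = (
    "Valid fields are cheeseburger, hamburger, fries, and drink.\n"
    "Order: Give me a cheeseburger and fries\n"
    'Output: { "cheeseburger": 1, "fries": 1 }\n'
    "Order: I want two burgers, a drink, and fries.\n"
    "Output:"
)

response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
print(response.text)
```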
Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/ec6ee4a1-3d6c-4092-ac56-d29c5c5c2382 b/docstore/ec6ee4a1-3d6c-4092-ac56-d29c5c5c2382 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/ec6ee4a1-3d6c-4092-ac56-d29c5c5c2382 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ec72c63d-ee5d-4b16-8619-eecbef337a5d b/docstore/ec72c63d-ee5d-4b16-8619-eecbef337a5d new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/ec72c63d-ee5d-4b16-8619-eecbef337a5d @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
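For reference alongside these model cards, here is a minimal Python sketch of an Imagen 3 request with the google-genai SDK; the prompt and output filenames are placeholders, and the Standard tier returns up to four images per request.

```python
from io import BytesIO

from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A close-up photo of a robot holding a red skateboard",  # placeholder prompt
    config=types.GenerateImagesConfig(number_of_images=4),  # up to 4 images per request
)

for i, generated in enumerate(response.generated_images):
    Image.open(BytesIO(generated.image.image_bytes)).save(f"imagen-output-{i}.png")
```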
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/ec794630-28ea-41c4-a5b3-050c5386bbd3 b/docstore/ec794630-28ea-41c4-a5b3-050c5386bbd3 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/ec794630-28ea-41c4-a5b3-050c5386bbd3 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ec81dcce-1cde-4216-9862-ea524c22353e b/docstore/ec81dcce-1cde-4216-9862-ea524c22353e new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/ec81dcce-1cde-4216-9862-ea524c22353e @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . 
\ No newline at end of file diff --git a/docstore/ec85848e-1ff5-4f17-a5c3-26cfaa4ca737 b/docstore/ec85848e-1ff5-4f17-a5c3-26cfaa4ca737 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/ec85848e-1ff5-4f17-a5c3-26cfaa4ca737 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/ec8797f8-a99d-423b-bd92-0229be13b231 b/docstore/ec8797f8-a99d-423b-bd92-0229be13b231 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/ec8797f8-a99d-423b-bd92-0229be13b231 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. 
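When automatic function calling is disabled or unavailable, the results described earlier in this section are returned to the model manually, in the same order they were requested. Below is a minimal Python sketch of that loop for a single hypothetical power_disco_ball call; the declaration, result payload, and model choice are illustrative.

```python
from google import genai
from google.genai import types

client = genai.Client()

# Hypothetical declaration matching the disco-ball example above.
power_disco_ball = types.FunctionDeclaration(
    name="power_disco_ball",
    description="Powers the spinning disco ball.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"power": types.Schema(type=types.Type.BOOLEAN)},
        required=["power"],
    ),
)
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[power_disco_ball])]
)

contents = [types.Content(role="user", parts=[types.Part(text="Turn on the disco ball.")])]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

# Execute the requested call ourselves, then return the result in the order it was asked for.
function_call = response.candidates[0].content.parts[0].function_call
result = {"status": "on" if function_call.args["power"] else "off"}

contents.append(response.candidates[0].content)  # keep the model's function call in the history
contents.append(
    types.Content(
        role="user",
        parts=[types.Part.from_function_response(name=function_call.name, response=result)],
    )
)

final = client.models.generate_content(model="gemini-2.5-flash", contents=contents, config=config)
print(final.text)
```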
Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/ec88065e-ea14-493f-8f0d-1d7424fff8c7 b/docstore/ec88065e-ea14-493f-8f0d-1d7424fff8c7 new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/ec88065e-ea14-493f-8f0d-1d7424fff8c7 @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/ec8a26d2-eb69-47b9-8a32-f8022d85f10e b/docstore/ec8a26d2-eb69-47b9-8a32-f8022d85f10e new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/ec8a26d2-eb69-47b9-8a32-f8022d85f10e @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/ec9e2021-7efe-4442-abcb-f500253c8c9b b/docstore/ec9e2021-7efe-4442-abcb-f500253c8c9b new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/ec9e2021-7efe-4442-abcb-f500253c8c9b @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/eca437a1-0317-45d6-b9de-a0c8edc1e7cb b/docstore/eca437a1-0317-45d6-b9de-a0c8edc1e7cb new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/eca437a1-0317-45d6-b9de-a0c8edc1e7cb @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . 
GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/ecacb8f5-ef5c-458b-96ab-560b4376b0d6 b/docstore/ecacb8f5-ef5c-458b-96ab-560b4376b0d6 new file mode 100644 index 0000000000000000000000000000000000000000..951343b7e154dfc2bc312960fc158259d984c283 --- /dev/null +++ b/docstore/ecacb8f5-ef5c-458b-96ab-560b4376b0d6 @@ -0,0 +1 @@ +client . aio . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) Chat Start a chat and send a message to the model: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) chat = model . start_chat () response = chat . send_message ( "Tell me a story in 100 words" ) response = chat . send_message ( "What happened after that?" ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const chat = model . startChat ({ history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); let result = await chat . sendMessage ( "I have 2 dogs in my house." ); console . log ( result . response . text ()); result = await chat . sendMessage ( "How many paws are in my house?" ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) cs := model . StartChat () cs . History = [] * genai . Content { { Parts : [] genai . Part { genai . Text ( "Hello, I have 2 dogs in my house." ), }, Role : "user" , }, { Parts : [] genai . Part { genai . 
Text ( "Great to meet you. What would you like to know?" ), }, Role : "model" , }, } res , err := cs . SendMessage ( ctx , genai . Text ( "How many paws are in my house?" )) if err != nil { log . Fatal ( err ) } printResponse ( res ) // utility for printing the response After Python from google import genai client = genai . Client () chat = client . chats . create ( model = 'gemini-2.0-flash' ) response = chat . send_message ( message = 'Tell me a story in 100 words' ) response = \ No newline at end of file diff --git a/docstore/ecc33b8b-0cc1-4cea-a047-a3b106e16cb0 b/docstore/ecc33b8b-0cc1-4cea-a047-a3b106e16cb0 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/ecc33b8b-0cc1-4cea-a047-a3b106e16cb0 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/ecc6ac83-c78f-4a91-be01-425fe4fbfec0 b/docstore/ecc6ac83-c78f-4a91-be01-425fe4fbfec0 new file mode 100644 index 0000000000000000000000000000000000000000..aacc2cdd644f4375a683fabb35c29fb0962b3a72 --- /dev/null +++ b/docstore/ecc6ac83-c78f-4a91-be01-425fe4fbfec0 @@ -0,0 +1 @@ +context include: Video question and answering Video memory, as shown with Google's Project Astra Video captioning Video recommendation systems, by enriching existing metadata with new multimodal understanding Video customization, by looking at a corpus of data and associated video metadata and then removing parts of videos that are not relevant to the viewer Video content moderation Real-time video processing When working with videos, it is important to consider how the videos are processed into tokens , which affects billing and usage limits. You can learn more about prompting with video files in the Prompting guide . Long form audio The Gemini models were the first natively multimodal large language models that could understand audio. Historically, the typical developer workflow would involve stringing together multiple domain specific models, like a speech-to-text model and a text-to-text model, in order to process audio. This led to additional latency required by performing multiple round-trip requests and decreased performance usually attributed to disconnected architectures of the multiple model setup. Some emerging and standard use cases for audio context include: Real-time transcription and translation Podcast / video question and answering Meeting transcription and summarization Voice assistants You can learn more about prompting with audio files in the Prompting guide . Long context optimizations The primary optimization when working with long context and the Gemini models is to use context caching . Beyond the previous impossibility of processing lots of tokens in a single request, the other main constraint was the cost. 
If you have a "chat with your data" app where a user uploads 10 PDFs, a video, and some work documents, you would historically have to work with a more complex retrieval augmented generation (RAG) tool / framework in order to process these requests and pay a significant amount for tokens moved into the context window. Now, you can cache \ No newline at end of file diff --git a/docstore/ece39269-1370-4d9f-a475-39ddb4e51896 b/docstore/ece39269-1370-4d9f-a475-39ddb4e51896 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/ece39269-1370-4d9f-a475-39ddb4e51896 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/ed04e9f1-b61e-4aba-9209-a7a7635195c9 b/docstore/ed04e9f1-b61e-4aba-9209-a7a7635195c9 new file mode 100644 index 0000000000000000000000000000000000000000..76ace95fe58e8b11b8d859b0314962fe72bedd09 --- /dev/null +++ b/docstore/ed04e9f1-b61e-4aba-9209-a7a7635195c9 @@ -0,0 +1 @@ +() { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . 
log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in \ No newline at end of file diff --git a/docstore/ed2770d5-59f4-4300-ad30-7bd7aca0e65e b/docstore/ed2770d5-59f4-4300-ad30-7bd7aca0e65e new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/ed2770d5-59f4-4300-ad30-7bd7aca0e65e @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
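As a small illustration of the version name patterns above, and assuming the google-genai Python SDK used in the surrounding examples, the same request can target the latest stable, a pinned stable, or a preview release just by changing the model string:

from google import genai

client = genai.Client()

for model_name in (
    "gemini-2.0-flash",              # latest stable for this generation and variation
    "gemini-2.0-flash-001",          # pinned stable version, recommended for production
    "gemini-2.5-pro-preview-06-05",  # preview release, subject to stricter rate limits
):
    response = client.models.generate_content(
        model=model_name,
        contents="Say hello in one short sentence.",
    )
    print(model_name, "->", response.text)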
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/ed522cf1-dbb6-482c-b5be-99209f77ea26 b/docstore/ed522cf1-dbb6-482c-b5be-99209f77ea26 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/ed522cf1-dbb6-482c-b5be-99209f77ea26 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/ed68afb0-ac7b-4874-b12b-32f4aa1b2924 b/docstore/ed68afb0-ac7b-4874-b12b-32f4aa1b2924 new file mode 100644 index 0000000000000000000000000000000000000000..d81ab5fe97c881039bfa04085c408d0af305b78e --- /dev/null +++ b/docstore/ed68afb0-ac7b-4874-b12b-32f4aa1b2924 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-tools#main-content Title: Tool use with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ed6d6907-cdab-4421-a215-912785a795e4 b/docstore/ed6d6907-cdab-4421-a215-912785a795e4 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/ed6d6907-cdab-4421-a215-912785a795e4 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". 
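A minimal sketch of such a detection prompt, assuming the google-genai Python SDK and an illustrative local image path; the [y0, x0, y1, x1] coordinate convention, normalized to 0-1000, mirrors the one described for segmentation below.

import json
from google import genai
from PIL import Image

client = genai.Client()
image = Image.open("path/to/kitchen.jpg")  # hypothetical image

prompt = (
    "Detect all green objects in this image. "
    "Return a JSON list where each entry has 'box_2d' as [y0, x0, y1, x1] "
    "normalized to 0-1000 and a short descriptive 'label'."
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, prompt],
)

# The model may wrap the JSON in markdown fencing; strip it before parsing.
text = response.text.strip()
if text.startswith("```"):
    text = text.split("```")[1].removeprefix("json")

for box in json.loads(text):
    print(box["label"], box["box_2d"])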
For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/ed7d1a7d-2b7b-48c0-9157-94697bf9f0ba b/docstore/ed7d1a7d-2b7b-48c0-9157-94697bf9f0ba new file mode 100644 index 0000000000000000000000000000000000000000..cb7319bb995c708a315638a1177ba448f5c39210 --- /dev/null +++ b/docstore/ed7d1a7d-2b7b-48c0-9157-94697bf9f0ba @@ -0,0 +1 @@ +: "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, \ No newline at end of file diff --git a/docstore/ed81bbda-ab2b-4098-ab64-0dfdf110a9e7 b/docstore/ed81bbda-ab2b-4098-ab64-0dfdf110a9e7 new file mode 100644 index 0000000000000000000000000000000000000000..acef50f8402c486c348013efe146de15b88cb32b --- /dev/null +++ b/docstore/ed81bbda-ab2b-4098-ab64-0dfdf110a9e7 @@ -0,0 +1 @@ +Function calling with the Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices. Get Weather Schedule Meeting Create Chart How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The Model does not execute the function itself. 
It's your application's responsibility to \ No newline at end of file diff --git a/docstore/ed9297dc-8afb-4d0b-8de6-04e9c3142611 b/docstore/ed9297dc-8afb-4d0b-8de6-04e9c3142611 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/ed9297dc-8afb-4d0b-8de6-04e9c3142611 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/edaec2d0-1eaf-40e3-bafe-e5447396455e b/docstore/edaec2d0-1eaf-40e3-bafe-e5447396455e new file mode 100644 index 0000000000000000000000000000000000000000..980cad742ce4bdad224c6b76fd35613451194dd7 --- /dev/null +++ b/docstore/edaec2d0-1eaf-40e3-bafe-e5447396455e @@ -0,0 +1 @@ +parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . 
Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : \ No newline at end of file diff --git a/docstore/edb82cfa-2f7f-4614-976c-cc30ae1afae2 b/docstore/edb82cfa-2f7f-4614-976c-cc30ae1afae2 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/edb82cfa-2f7f-4614-976c-cc30ae1afae2 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/ee046bfe-a8ba-4915-b977-5150957686c2 b/docstore/ee046bfe-a8ba-4915-b977-5150957686c2 new file mode 100644 index 0000000000000000000000000000000000000000..448fb0d86f65fba6c1ebada31e49a4124dab31aa --- /dev/null +++ b/docstore/ee046bfe-a8ba-4915-b977-5150957686c2 @@ -0,0 +1 @@ +message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list \ No newline at end of file diff --git a/docstore/ee09f2bb-3392-4c32-bcb5-53789e6d13bb b/docstore/ee09f2bb-3392-4c32-bcb5-53789e6d13bb new file mode 100644 index 0000000000000000000000000000000000000000..cc5e2a23a1aa933103609ebe99004920f218cf78 --- /dev/null +++ b/docstore/ee09f2bb-3392-4c32-bcb5-53789e6d13bb @@ -0,0 +1 @@ +"google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) localPdfPath := "/path/to/file.pdf" uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Give me a summary of this pdf file." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . 
Println ( result . Text ()) } REST NUM_BYTES = $( wc -c < " ${ PDF_PATH } " ) DISPLAY_NAME = TEXT tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GEMINI_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: application/pdf" \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ PDF_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl \ No newline at end of file diff --git a/docstore/ee0bd670-316d-445e-894e-a2ad410c51ba b/docstore/ee0bd670-316d-445e-894e-a2ad410c51ba new file mode 100644 index 0000000000000000000000000000000000000000..db99ecca5cc6f12a3e5f4ea036908e35449e2528 --- /dev/null +++ b/docstore/ee0bd670-316d-445e-894e-a2ad410c51ba @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#imagen-3 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ee179a4c-d24b-4fbd-b08c-6ed690fb1570 b/docstore/ee179a4c-d24b-4fbd-b08c-6ed690fb1570 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/ee179a4c-d24b-4fbd-b08c-6ed690fb1570 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/ee2dea2a-687b-4ec9-a34d-1fbd3f4a7b1e b/docstore/ee2dea2a-687b-4ec9-a34d-1fbd3f4a7b1e new file mode 100644 index 0000000000000000000000000000000000000000..a76efec9a9a3e7390e77e9a866cc227646391c5b --- /dev/null +++ b/docstore/ee2dea2a-687b-4ec9-a34d-1fbd3f4a7b1e @@ -0,0 +1 @@ +Billing | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Billing This guide provides an overview of different Gemini API billing options, explains how to enable billing and monitor usage, and provides answers to frequently asked questions (FAQs) about billing. Upgrade to the Gemini API paid tier About billing Billing for the Gemini API is based on two pricing tiers: free of charge (or free ) and pay-as-you-go (or paid ). Pricing and rate limits differ between these tiers and also vary by model. You can check out the rate limits and pricing pages for more into. For a model-by-model breakdown of capabilities, see the Gemini models page . How to request an upgrade To transition from the free tier to the pay-as-you-go plan, you need to enable billing for your Google Cloud project. The button you see in Google AI Studio depends on your project's current plan. If you're on the free tier, you'll see a Set up Billing button for your project. If you're already on the paid tier and meet the criteria for a plan change, you might see an Upgrade button. To start the process, follow these steps: Go to the AI Studio API keys page . Find the project you want to move to the paid plan and click either Set up Billing or Upgrade , depending on the button displayed. The next step depends on the button you clicked: If you clicked Set up Billing: You'll be redirected to the Google Cloud console to link a billing account to your project. Follow the on-screen instructions to complete the process. If you clicked Upgrade: The system will automatically verify your project's eligibility. If your project meets all the requirements, it will be instantly upgraded to \ No newline at end of file diff --git a/docstore/ee53ec5d-0b36-44f5-8a14-70cd6368a918 b/docstore/ee53ec5d-0b36-44f5-8a14-70cd6368a918 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/ee53ec5d-0b36-44f5-8a14-70cd6368a918 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/ee749fbe-6903-45a9-999c-6c64820987ce b/docstore/ee749fbe-6903-45a9-999c-6c64820987ce new file mode 100644 index 0000000000000000000000000000000000000000..4542d68c23b70ac5014267f60edbcffd2138877f --- /dev/null +++ b/docstore/ee749fbe-6903-45a9-999c-6c64820987ce @@ -0,0 +1 @@ +, headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . 
Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . \ No newline at end of file diff --git a/docstore/ee91f36e-4fe6-49a9-9d9d-d08fb58a1ffb b/docstore/ee91f36e-4fe6-49a9-9d9d-d08fb58a1ffb new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/ee91f36e-4fe6-49a9-9d9d-d08fb58a1ffb @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/ee9c2a46-6fa3-4dbb-a04e-c88c60276e16 b/docstore/ee9c2a46-6fa3-4dbb-a04e-c88c60276e16 new file mode 100644 index 0000000000000000000000000000000000000000..bf4a48096b84622083d96343210f25866e78f754 --- /dev/null +++ b/docstore/ee9c2a46-6fa3-4dbb-a04e-c88c60276e16 @@ -0,0 +1 @@ +a picture of me. Can you add a llama next to me?" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/png" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } config := & genai . GenerateContentConfig { ResponseModalities : [] string { "TEXT" , "IMAGE" }, } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash-preview-image-generation" , contents , config , ) for _ , part := range result . Candidates [ 0 ]. Content . Parts { if part . Text != "" { fmt . Println ( part . Text ) } else if part . InlineData != nil { imageBytes := part . InlineData . Data outputFilename := "gemini_generated_image.png" _ = os . 
WriteFile ( outputFilename , imageBytes , 0644 ) } } } REST IMG_PATH = /path/to/your/image1.jpeg if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMG_BASE64 = $( base64 " $B64FLAGS " " $IMG_PATH " 2>&1 ) curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-preview-image-generation:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d "{ \"contents\": [{ \"parts\":[ {\"text\": \"'Hi, This is a picture of me. Can you add a llama next to me\"}, { \"inline_data\": { \"mime_type\":\"image/jpeg\", \"data\": \" $IMG_BASE64 \" } } ] }], \"generationConfig\": {\"responseModalities\": [\"TEXT\", \"IMAGE\"]} }" \ | grep -o '"data": "[^"]*"' \ | cut -d '"' -f4 \ | base64 --decode > gemini-edited-image.png Other image generation modes Gemini supports other image interaction modes based on prompt structure and context, including: Text to image(s) and text (interleaved): Outputs images with related text. Example prompt: "Generate an illustrated recipe for a paella." Image(s) and text to image(s) and text (interleaved) : Uses input images and text to create new related images and text. Example prompt: (With an image of a furnished room) \ No newline at end of file diff --git a/docstore/eea332a4-2559-485d-b2fd-23bc7b1a7007 b/docstore/eea332a4-2559-485d-b2fd-23bc7b1a7007 new file mode 100644 index 0000000000000000000000000000000000000000..18e5380dd4144398b3d4c6273920669cbc2b0130 --- /dev/null +++ b/docstore/eea332a4-2559-485d-b2fd-23bc7b1a7007 @@ -0,0 +1 @@ +'gemini-2.0-flash' , contents = 'Tell me a story in 100 words.' , config = types . GenerateContentConfig ( system_instruction = 'you are a story teller for kids under 5 years old' , max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], seed = 42 , ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story about a magic backpack." , config : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York" ), & genai . GenerateContentConfig { Temperature : genai . Ptr [ float32 ]( 0.5 ), TopP : genai . Ptr [ float32 ]( 0.5 ), TopK : genai . Ptr [ float32 ]( 2.0 ), ResponseMIMEType : "application/json" , StopSequences : [] string { "Yankees" }, CandidateCount : 2 , Seed : genai . Ptr [ int32 ]( 42 ), MaxOutputTokens : 128 , PresencePenalty : genai . Ptr [ float32 ]( 0.5 ), FrequencyPenalty : genai . Ptr [ float32 ]( 0.5 ), }, ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing response Safety settings Generate a response with safety settings: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . 
generate_content ( 'say something bad' , safety_settings = { 'HATE' : 'BLOCK_ONLY_HIGH' , 'HARASSMENT' : 'BLOCK_ONLY_HIGH' , } ) JavaScript import { GoogleGenerativeAI , HarmCategory , HarmBlockThreshold } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI \ No newline at end of file diff --git a/docstore/eeb4b1e4-3014-45ae-bbdb-4a6cdf151f6a b/docstore/eeb4b1e4-3014-45ae-bbdb-4a6cdf151f6a new file mode 100644 index 0000000000000000000000000000000000000000..ebc8fdc5ad27fd96758924c177eadfccc4d6556f --- /dev/null +++ b/docstore/eeb4b1e4-3014-45ae-bbdb-4a6cdf151f6a @@ -0,0 +1 @@ +Structured output | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Structured output You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes, standardize them to build a structured database. Gemini can generate either JSON or enum values as structured output. Generating JSON There are two ways to generate JSON using the Gemini API: Configure a schema on the model Provide a schema in a text prompt Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON. Configuring a schema (recommended) To constrain the model to generate JSON, configure a responseSchema . The model will then respond to any prompt with JSON-formatted output. Python from google import genai from pydantic import BaseModel class Recipe ( BaseModel ): recipe_name : str ingredients : list [ str ] client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "List a few popular cookie recipes, and include the amounts of ingredients." , config = { "response_mime_type" : "application/json" , "response_schema" : list [ Recipe ], }, ) # Use the response as a JSON string. print ( response . text ) # Use instantiated objects. my_recipes : list [ Recipe ] = response . parsed Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null. JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = \ No newline at end of file diff --git a/docstore/eed61d82-23d4-4256-80e3-dd3cd5af065d b/docstore/eed61d82-23d4-4256-80e3-dd3cd5af065d new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/eed61d82-23d4-4256-80e3-dd3cd5af065d @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. 
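A minimal sketch of measuring relatedness with this model, assuming the google-genai Python SDK; the sample sentences and the cosine-similarity helper are illustrative, not part of the model documentation.

from google import genai

client = genai.Client()

texts = [
    "How do I bake sourdough bread?",
    "What is the best starter for sourdough?",
    "How do I change a car tire?",
]

result = client.models.embed_content(
    model="text-embedding-004",
    contents=texts,
)

def cosine(a, b):
    # Plain cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    return dot / (norm_a * norm_b)

vectors = [e.values for e in result.embeddings]
print("bread vs starter:", cosine(vectors[0], vectors[1]))  # expected to be high
print("bread vs tire:   ", cosine(vectors[0], vectors[2]))  # expected to be lower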
Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/eedc98f7-0ae2-4a02-b7b3-6563e7dc9662 b/docstore/eedc98f7-0ae2-4a02-b7b3-6563e7dc9662 new file mode 100644 index 0000000000000000000000000000000000000000..0ae258e3f4ca40ea8954afdd9da06087e388f8d9 --- /dev/null +++ b/docstore/eedc98f7-0ae2-4a02-b7b3-6563e7dc9662 @@ -0,0 +1 @@ +generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) docUrl1 := "https://arxiv.org/pdf/2312.11805" docUrl2 := "https://arxiv.org/pdf/2403.05530" localPath1 := "doc1_downloaded.pdf" localPath2 := "doc2_downloaded.pdf" respHttp1 , _ := http . Get ( docUrl1 ) defer respHttp1 . Body . Close () outFile1 , _ := os . Create ( localPath1 ) _ , _ = io . Copy ( outFile1 , respHttp1 . Body ) outFile1 . Close () respHttp2 , _ := http . Get ( docUrl2 ) defer respHttp2 . Body . Close () outFile2 , _ := os . Create ( localPath2 ) _ , _ = io . Copy ( outFile2 , respHttp2 . Body ) outFile2 . Close () uploadConfig1 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile1 , _ := client . Files . UploadFromPath ( ctx , localPath1 , uploadConfig1 ) uploadConfig2 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile2 , _ := client . Files . UploadFromPath ( ctx , localPath2 , uploadConfig2 ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile1 . URI , uploadedFile1 . MIMEType ), genai . NewPartFromURI ( uploadedFile2 . URI , uploadedFile2 . MIMEType ), genai . 
NewPartFromText ( "What is the difference between each of the " + "main benchmarks between these two papers? " + "Output these in a table." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } modelName := "gemini-2.5-flash" result , _ := client . Models . GenerateContent ( ctx , modelName , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL_1 = "https://arxiv.org/pdf/2312.11805" DOC_URL_2 = "https://arxiv.org/pdf/2403.05530" DISPLAY_NAME_1 = "Gemini_paper" \ No newline at end of file diff --git a/docstore/eee5ed0d-b2d0-4663-8726-0ec07e86df6f b/docstore/eee5ed0d-b2d0-4663-8726-0ec07e86df6f new file mode 100644 index 0000000000000000000000000000000000000000..2d7fb3592b6f7a6d7aadabbb231e86ce3c5bc44a --- /dev/null +++ b/docstore/eee5ed0d-b2d0-4663-8726-0ec07e86df6f @@ -0,0 +1 @@ +to some safety risks that can arise when using LLMs, and recommend emerging safety design and development recommendations. (Note that laws and regulations may also impose restrictions, but such considerations are beyond the scope of this guide.) The following steps are recommended when building applications with LLMs: Understanding the safety risks of your application Considering adjustments to mitigate safety risks Performing safety testing appropriate to your use case Soliciting feedback from users and monitoring usage The adjustment and testing phases should be iterative until you reach performance appropriate for your application. Understand the safety risks of your application In this context, safety is being defined as the ability of an LLM to avoid causing harm to its users, for example, by generating toxic language or content that promotes stereotypes. The models available through the Gemini API have been designed with Google’s AI principles in mind and your use of it is subject to the Generative AI Prohibited Use Policy . The API provides built-in safety filters to help address some common language model problems such as toxic language and hate speech, and striving for inclusiveness and avoidance of stereotypes. However, each application can pose a different set of risks to its users. So as the application owner, you are responsible for knowing your users and the potential harms your application may cause, and ensuring that your application uses LLMs safely and responsibly. As part of this assessment, you should consider the likelihood that harm could occur and determine its seriousness and mitigation steps. For example, an app that generates essays based on factual events would need to be more careful about avoiding misinformation, as compared to an app that generates fictional stories for entertainment. A good way to begin exploring potential safety risks is to research your end users, and others who might be affected by your application's results. This \ No newline at end of file diff --git a/docstore/ef40c6b5-05ec-44ef-83f4-6f5361e5d478 b/docstore/ef40c6b5-05ec-44ef-83f4-6f5361e5d478 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/ef40c6b5-05ec-44ef-83f4-6f5361e5d478 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/ef4f83c7-6069-4d8b-a448-032101c44e8a b/docstore/ef4f83c7-6069-4d8b-a448-032101c44e8a new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/ef4f83c7-6069-4d8b-a448-032101c44e8a @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . 
GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/ef510463-a07f-4695-a00e-f8294930a889 b/docstore/ef510463-a07f-4695-a00e-f8294930a889 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/ef510463-a07f-4695-a00e-f8294930a889 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . 
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/ef62a3d3-6de6-4b27-af2f-191fa00c8412 b/docstore/ef62a3d3-6de6-4b27-af2f-191fa00c8412 new file mode 100644 index 0000000000000000000000000000000000000000..807111324633242e2ce44214eeb6131d61befa1a --- /dev/null +++ b/docstore/ef62a3d3-6de6-4b27-af2f-191fa00c8412 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output-thinking Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ef7e924c-84f2-471a-9db5-5eac485e356a b/docstore/ef7e924c-84f2-471a-9db5-5eac485e356a new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/ef7e924c-84f2-471a-9db5-5eac485e356a @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/ef9c43f8-f94f-4058-b0f3-2910ea17ddc0 b/docstore/ef9c43f8-f94f-4058-b0f3-2910ea17ddc0 new file mode 100644 index 0000000000000000000000000000000000000000..7c883bd368c31d390cf31dfb7ab8807048f20c67 --- /dev/null +++ b/docstore/ef9c43f8-f94f-4058-b0f3-2910ea17ddc0 @@ -0,0 +1 @@ +caching price Not available $0.075 (text / image / video) $0.25 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash-Lite RPD) 1,500 RPD (free, limit shared with Flash-Lite RPD), then $35 / 1,000 requests Live API Free of charge Input: $0.50 (text), $3.00 (audio / image [video]) Output: $2.00 (text), $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash-Lite Preview Try it in Google AI Studio Our smallest and most cost effective model, built for at scale usage. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price (text, image, video) Free of charge $0.10 (text / image / video) $0.50 (audio) Output price (including thinking tokens) Free of charge $0.40 Context caching price Not available $0.025 (text / image / video) $0.125 (audio) $1.00 / 1,000,000 tokens per hour (storage price) Grounding with Google Search Free of charge, up to 500 RPD (limit shared with Flash RPD) 1,500 RPD (free, limit shared with Flash RPD), then $35 / 1,000 requests Used to improve our products Yes No Gemini 2.5 Flash Native Audio Try it in Google AI Studio Our native audio models optimized for higher quality audio outputs with better pacing, voice naturalness, verbosity, and mood. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Not available $0.50 (text) $3.00 (audio / video) Output price (including thinking tokens) Not available $2.00 (text) $12.00 (audio) Used to improve our products Yes No Gemini 2.5 Flash Preview TTS Try it in Google AI Studio Our 2.5 Flash text-to-speech audio model optimized for price-performant, low-latency, controllable speech generation. Preview models may change before becoming stable and have more restrictive rate limits. Free Tier Paid Tier, per 1M tokens in USD Input price Free of \ No newline at end of file diff --git a/docstore/efa09635-7f89-4044-ac20-ff2fc0e771c7 b/docstore/efa09635-7f89-4044-ac20-ff2fc0e771c7 new file mode 100644 index 0000000000000000000000000000000000000000..136dba231235fa237f8f31f4cf4123c5c10a2f24 --- /dev/null +++ b/docstore/efa09635-7f89-4044-ac20-ff2fc0e771c7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/embeddings#main-content Title: Embeddings | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/efda4ec3-f8f6-47a9-b006-2f6adab0640b b/docstore/efda4ec3-f8f6-47a9-b006-2f6adab0640b new file mode 100644 index 0000000000000000000000000000000000000000..69f7399c35aaaad68e1bd1a996c44353577b3a79 --- /dev/null +++ b/docstore/efda4ec3-f8f6-47a9-b006-2f6adab0640b @@ -0,0 +1 @@ +the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await \ No newline at end of file diff --git a/docstore/efee6f5c-1457-4008-a025-5a02e7d651d5 b/docstore/efee6f5c-1457-4008-a025-5a02e7d651d5 new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/efee6f5c-1457-4008-a025-5a02e7d651d5 @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 
2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/eff13772-81d2-44e0-b2ab-ea2dff3e8796 b/docstore/eff13772-81d2-44e0-b2ab-ea2dff3e8796 new file mode 100644 index 0000000000000000000000000000000000000000..d028e190ded4732dff341fe647fe7532c210272b --- /dev/null +++ b/docstore/eff13772-81d2-44e0-b2ab-ea2dff3e8796 @@ -0,0 +1 @@ +second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me. But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the \ No newline at end of file diff --git a/docstore/eff41023-49a6-4c0c-9cc8-6d081aa3cc5c b/docstore/eff41023-49a6-4c0c-9cc8-6d081aa3cc5c new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/eff41023-49a6-4c0c-9cc8-6d081aa3cc5c @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." 
, }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/f018cd0d-cb55-40ab-bf2f-4062eacf7e36 b/docstore/f018cd0d-cb55-40ab-bf2f-4062eacf7e36 new file mode 100644 index 0000000000000000000000000000000000000000..56f7dcc9bceb6e3744499a44a4d77c57b76426e8 --- /dev/null +++ b/docstore/f018cd0d-cb55-40ab-bf2f-4062eacf7e36 @@ -0,0 +1 @@ += "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result .
Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = \ No newline at end of file diff --git a/docstore/f0510107-8694-491a-befd-216e37331cb4 b/docstore/f0510107-8694-491a-befd-216e37331cb4 new file mode 100644 index 0000000000000000000000000000000000000000..5b10a49a34afcc5006e0bf4f1bcb0c14355ae334 --- /dev/null +++ b/docstore/f0510107-8694-491a-befd-216e37331cb4 @@ -0,0 +1 @@ +environment includes the following libraries: attrs chess contourpy fpdf geopandas imageio jinja2 joblib jsonschema jsonschema-specifications lxml matplotlib mpmath numpy opencv-python openpyxl packaging pandas pillow protobuf pylatex pyparsing PyPDF2 python-dateutil python-docx python-pptx reportlab scikit-learn scipy seaborn six striprtf sympy tabulate tensorflow toolz xlrd You can't install your own libraries. Note: Only matplotlib is supported for graph rendering using code execution. What's next Try the code execution Colab . Learn about other Gemini API tools: Function calling Grounding with Google Search Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/f061291a-9e6f-4dfc-9625-a11286b13f24 b/docstore/f061291a-9e6f-4dfc-9625-a11286b13f24 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f061291a-9e6f-4dfc-9625-a11286b13f24 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. 
Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f0734a2c-70d8-48dd-ab05-ced8349feb1a b/docstore/f0734a2c-70d8-48dd-ab05-ced8349feb1a new file mode 100644 index 0000000000000000000000000000000000000000..54ee11bfb756db29fb776eb5a6d4247407dfa205 --- /dev/null +++ b/docstore/f0734a2c-70d8-48dd-ab05-ced8349feb1a @@ -0,0 +1 @@ +(Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. 
The \ No newline at end of file diff --git a/docstore/f07c4d4e-69c2-4d35-b31e-106c1941ea93 b/docstore/f07c4d4e-69c2-4d35-b31e-106c1941ea93 new file mode 100644 index 0000000000000000000000000000000000000000..b8d7dab8b59ea83c8480687d32380faf07bab32f --- /dev/null +++ b/docstore/f07c4d4e-69c2-4d35-b31e-106c1941ea93 @@ -0,0 +1 @@ +google.generativeai as genai # Directly create and use model objects model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( ... ) chat = model . start_chat ( ... ) JavaScript While GoogleGenerativeAI was a central point for models and chat, other functionalities like file and cache management often required importing and instantiating entirely separate client classes. import { GoogleGenerativeAI } from "@google/generative-ai" ; import { GoogleAIFileManager , GoogleAICacheManager } from "@google/generative-ai/server" ; // For files/caching const genAI = new GoogleGenerativeAI ( "YOUR_API_KEY" ); const fileManager = new GoogleAIFileManager ( "YOUR_API_KEY" ); const cacheManager = new GoogleAICacheManager ( "YOUR_API_KEY" ); // Get a model instance, then call methods on it const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const result = await model . generateContent (...); const chat = model . startChat (...); // Call methods on separate client objects for other services const uploadedFile = await fileManager . uploadFile (...); const cache = await cacheManager . create (...); Go The genai.NewClient function created a client, but generative model operations were typically called on a separate GenerativeModel instance obtained from this client. Other services might have been accessed via distinct packages or patterns. import ( "github.com/google/generative-ai-go/genai" "github.com/google/generative-ai-go/genai/fileman" // For files "google.golang.org/api/option" ) client , err := genai . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) fileClient , err := fileman . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) // Get a model instance, then call methods on it model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ... ) cs := model . StartChat () // Call methods on separate client objects for other services uploadedFile , err := fileClient . \ No newline at end of file diff --git a/docstore/f0860c43-afb8-4129-bd3f-98d053d3144e b/docstore/f0860c43-afb8-4129-bd3f-98d053d3144e new file mode 100644 index 0000000000000000000000000000000000000000..41c5d7c70c10b0c099f849b39a650a62d6333896 --- /dev/null +++ b/docstore/f0860c43-afb8-4129-bd3f-98d053d3144e @@ -0,0 +1 @@ +npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disabled true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . 
connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is an experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources or prompts It is available for the Python and JavaScript/TypeScript SDKs. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limitations affect what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page. \ No newline at end of file diff --git a/docstore/f088f1a5-427f-42b8-8ab3-5bdd6d876280 b/docstore/f088f1a5-427f-42b8-8ab3-5bdd6d876280 new file mode 100644 index 0000000000000000000000000000000000000000..7ad07eb45fff1ffd88928a8c1191c40c43412859 --- /dev/null +++ b/docstore/f088f1a5-427f-42b8-8ab3-5bdd6d876280 @@ -0,0 +1 @@ +public domain and does not show identifiable people. ( NASA image and media usage guidelines. ) The following code downloads the sample video, uploads it using the File API, waits for it to be processed, and then uses the file reference in a generateContent request. Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp4" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ myfile , "Summarize this video. Then create a quiz with an answer key based on the information in this video." ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp4" , config : { mimeType : "video/mp4" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Summarize this video. Then create a quiz with an answer key based on the information in this video." , ]), }); console . log ( response . text ); } await main (); Go uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.mp4" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Summarize this video. Then create a quiz with an answer key based on the information in this video." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result .
Text ()) REST VIDEO_PATH = "path/to/sample.mp4" MIME_TYPE = $( file -b --mime-type " ${ VIDEO_PATH } " ) NUM_BYTES = $( wc -c < " ${ VIDEO_PATH } " ) DISPLAY_NAME = VIDEO tmp_header_file = upload-header.tmp echo "Starting file \ No newline at end of file diff --git a/docstore/f09a747b-581e-418d-bace-f4692848da20 b/docstore/f09a747b-581e-418d-bace-f4692848da20 new file mode 100644 index 0000000000000000000000000000000000000000..d31375a87cef28a1721f2cadc796b07b9730a33c --- /dev/null +++ b/docstore/f09a747b-581e-418d-bace-f4692848da20 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting_with_media#main-content Title: Files API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f0a6d7ca-e128-4747-b3a3-4483f68a255f b/docstore/f0a6d7ca-e128-4747-b3a3-4483f68a255f new file mode 100644 index 0000000000000000000000000000000000000000..dd1226540612f04fb3f971567b47c61067071189 --- /dev/null +++ b/docstore/f0a6d7ca-e128-4747-b3a3-4483f68a255f @@ -0,0 +1 @@ +This example shows you how to specify a subject description. Subject description Prompt Generated output The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building." An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements Context This example shows you how to specify context. Context Prompt Generated output The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space. A satellite floating through outer space with the moon and some stars in the background. Action This example shows you how to specify action. Action Prompt Generated output What is the subject doing, like walking, running, or turning their head? A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset. Style This example shows you how to specify style. Style Prompt Generated output You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure. Film noir style, man and woman walk on the street, mystery, cinematic, black and white. Camera motion This example shows you how to specify camera motion. Camera motion Prompt Generated output Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot. A POV shot from a vintage car driving in the rain, Canada at night, cinematic. Composition This example shows you how to specify composition. Composition Prompt Generated output How the shot is framed (wide shot, close-up, low angle). Extreme close-up of an eye with city reflected in it. Create a video of a wide shot of a surfer walking on a beach with a surfboard, beautiful sunset, cinematic.
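Before the ambiance example below, here is a minimal sketch of sending a prompt assembled from these elements (subject, context, action, style, camera motion, composition) to Veo with the google-genai Python SDK, following the generate-and-poll pattern shown earlier; the prompt text is illustrative.

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

# Subject + context + action + style + camera motion + composition in one prompt.
prompt = (
    "A wide shot of a surfer walking on a beach with a surfboard, "
    "beautiful sunset, tracking drone view, cinematic, shallow depth of field"
)

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=prompt,
    config=types.GenerateVideosConfig(aspect_ratio="16:9"),
)
while not operation.done:  # video generation is a long-running operation
    time.sleep(10)
    operation = client.operations.get(operation)

for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video{n}.mp4")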
Ambiance This example \ No newline at end of file diff --git a/docstore/f0a98fcf-0647-4933-bea8-66f48949511a b/docstore/f0a98fcf-0647-4933-bea8-66f48949511a new file mode 100644 index 0000000000000000000000000000000000000000..a0fb95ef06df73f23ea2a2b8453418ec4557d8c9 --- /dev/null +++ b/docstore/f0a98fcf-0647-4933-bea8-66f48949511a @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-preview-image-generation Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f0e0bf22-44a3-41af-b484-6820eeca173e b/docstore/f0e0bf22-44a3-41af-b484-6820eeca173e new file mode 100644 index 0000000000000000000000000000000000000000..d1c2e2cc31f0202ca4bec0cd65c14ecc40b8547a --- /dev/null +++ b/docstore/f0e0bf22-44a3-41af-b484-6820eeca173e @@ -0,0 +1 @@ +Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. 
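Because 2.5 Flash thinks as needed, the amount of internal reasoning can be capped with the thinking budget mentioned earlier. A minimal sketch with the google-genai Python SDK follows; the budget value is arbitrary, and (where the model allows it) a budget of 0 turns thinking off.

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        # Cap the tokens the model may spend on internal "thinking".
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)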
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version \ No newline at end of file diff --git a/docstore/f10101f9-63c9-4638-8df5-72bfb0ac3376 b/docstore/f10101f9-63c9-4638-8df5-72bfb0ac3376 new file mode 100644 index 0000000000000000000000000000000000000000..066e9da5e5532baacb51b9c64378b6b187ca260c --- /dev/null +++ b/docstore/f10101f9-63c9-4638-8df5-72bfb0ac3376 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#rate-limits Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f116eba6-9bf3-4893-b456-d5b2f70a99c4 b/docstore/f116eba6-9bf3-4893-b456-d5b2f70a99c4 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/f116eba6-9bf3-4893-b456-d5b2f70a99c4 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. 
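Before the Live API light-control schemas below, here is a minimal sketch of that Python SDK behavior: a plain Python callable is passed as a tool, and automatic function calling lets the SDK build the declaration, execute the function, and return the result to the model. The get_weather_forecast function here is a mock added for illustration.

Python
from google import genai
from google.genai import types

client = genai.Client()

def get_weather_forecast(location: str) -> dict:
    """Mock weather lookup that always reports 25°C."""
    return {"temperature": 25, "unit": "celsius"}

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is the weather in London?",
    config=types.GenerateContentConfig(
        tools=[get_weather_forecast],  # the SDK calls this automatically when the model requests it
    ),
)
print(response.text)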
Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/f1182004-f9b6-4bbe-a111-afc758f51c5d b/docstore/f1182004-f9b6-4bbe-a111-afc758f51c5d new file mode 100644 index 0000000000000000000000000000000000000000..c60a398b68d2fb158c62411b9f70b1da071d4fb4 --- /dev/null +++ b/docstore/f1182004-f9b6-4bbe-a111-afc758f51c5d @@ -0,0 +1 @@ +response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing in the ClientSession into the tools parameters. If you want to disable it, you can provide automatic_function_calling with disabled True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m- %d ' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. \ No newline at end of file diff --git a/docstore/f11d91fa-f580-4ae6-a712-a4c3e0fd83ca b/docstore/f11d91fa-f580-4ae6-a712-a4c3e0fd83ca new file mode 100644 index 0000000000000000000000000000000000000000..a70be0d04647e3405f67cb87b3156d8922b7c210 --- /dev/null +++ b/docstore/f11d91fa-f580-4ae6-a712-a4c3e0fd83ca @@ -0,0 +1 @@ +to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. 
You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results. Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to get the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the \ No newline at end of file diff --git a/docstore/f1305d98-5ee4-4f15-9943-77d023137982 b/docstore/f1305d98-5ee4-4f15-9943-77d023137982 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/f1305d98-5ee4-4f15-9943-77d023137982 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
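Returning to the prompt-chaining guidance above, a minimal sketch in which the output of one prompt becomes the input of the next; the two-step outline-then-essay task is illustrative.

Python
from google import genai

client = genai.Client()

# Step 1: the first prompt in the chain.
outline = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Create an outline for an essay about hummingbirds.",
).text

# Step 2: the previous output becomes part of the next prompt.
essay = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"Write a short essay that follows this outline:\n\n{outline}",
).text

print(essay)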
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/f138f0f4-bb53-4c9e-b1d4-97abbdd80864 b/docstore/f138f0f4-bb53-4c9e-b1d4-97abbdd80864 new file mode 100644 index 0000000000000000000000000000000000000000..987408a9d5fce99ac4990dd87d63b0d860b9553d --- /dev/null +++ b/docstore/f138f0f4-bb53-4c9e-b1d4-97abbdd80864 @@ -0,0 +1 @@ +overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash) Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a \ No newline at end of file diff --git a/docstore/f15d3185-5a14-4622-b61e-5364eb9577ba b/docstore/f15d3185-5a14-4622-b61e-5364eb9577ba new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f15d3185-5a14-4622-b61e-5364eb9577ba @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
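A minimal sketch of single-speaker speech generation with this preview TTS model, using the google-genai Python SDK. The voice name and the 24 kHz, 16-bit mono PCM output format follow the speech-generation guide and should be treated as assumptions here.

Python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The model returns raw PCM; wrap it in a WAV container for playback.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as f:
    f.setnchannels(1)
    f.setsampwidth(2)
    f.setframerate(24000)
    f.writeframes(pcm)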
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/f17e112f-be25-4965-9abc-7f550d68c313 b/docstore/f17e112f-be25-4965-9abc-7f550d68c313 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/f17e112f-be25-4965-9abc-7f550d68c313 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." 
, }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/f197264c-b282-4dcd-a5bf-1a21bfb44f20 b/docstore/f197264c-b282-4dcd-a5bf-1a21bfb44f20 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/f197264c-b282-4dcd-a5bf-1a21bfb44f20 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/f1a9edc1-f639-434c-99dd-6ed5176226bb b/docstore/f1a9edc1-f639-434c-99dd-6ed5176226bb new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f1a9edc1-f639-434c-99dd-6ed5176226bb @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f1b0f23c-5304-41fd-98a6-14aa58cac2eb b/docstore/f1b0f23c-5304-41fd-98a6-14aa58cac2eb new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/f1b0f23c-5304-41fd-98a6-14aa58cac2eb @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . 
show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . \ No newline at end of file diff --git a/docstore/f1d72fe2-d8ec-41c9-aeb2-cf0c8c08a770 b/docstore/f1d72fe2-d8ec-41c9-aeb2-cf0c8c08a770 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/f1d72fe2-d8ec-41c9-aeb2-cf0c8c08a770 @@ -0,0 +1 @@ +help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . 
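To make the temperature behaviour described above concrete, here is a small sketch using the same google-genai Python client that appears in the other snippets; the model name and temperature values are illustrative choices, not requirements.
Python
from google import genai
from google.genai import types

client = genai.Client()
prompt = "The dog jumped over the ..."

# Temperature 0: the most likely token is picked at every step, so repeated
# calls tend to produce the same completion.
greedy = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=0.0),
)

# A higher temperature samples more broadly over the token distribution,
# so repeated calls produce more varied, less predictable completions.
sampled = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=types.GenerateContentConfig(temperature=1.5),
)

print(greedy.text)
print(sampled.text)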
To learn \ No newline at end of file diff --git a/docstore/f1f00309-05f5-40c9-8a74-a6a692ee6fa8 b/docstore/f1f00309-05f5-40c9-8a74-a6a692ee6fa8 new file mode 100644 index 0000000000000000000000000000000000000000..a2eb6effe018fcbb28eb80c102d9fbb9beed9d0c --- /dev/null +++ b/docstore/f1f00309-05f5-40c9-8a74-a6a692ee6fa8 @@ -0,0 +1 @@ +50,000 lines of code (with the standard 80 characters per line) All the text messages you have sent in the last 5 years 8 average length English novels Transcripts of over 200 average length podcast episodes The more limited context windows common in many other models often require strategies like arbitrarily dropping old messages, summarizing content, using RAG with vector databases, or filtering prompts to save tokens. While these techniques remain valuable in specific scenarios, Gemini's extensive context window invites a more direct approach: providing all relevant information upfront. Because Gemini models were purpose-built with massive context capabilities, they demonstrate powerful in-context learning. For example, using only in-context instructional materials (a 500-page reference grammar, a dictionary, and ≈400 parallel sentences), Gemini learned to translate from English to Kalamang—a Papuan language with fewer than 200 speakers—with quality similar to a human learner using the same materials. This illustrates the paradigm shift enabled by Gemini's long context, empowering new possibilities through robust in-context learning. Long context use cases While the standard use case for most generative models is still text input, the Gemini model family enables a new paradigm of multimodal use cases. These models can natively understand text, video, audio, and images. They are accompanied by the Gemini API that takes in multimodal file types for convenience. Long form text Text has proved to be the layer of intelligence underpinning much of the momentum around LLMs. As mentioned earlier, much of the practical limitation of LLMs was because of not having a large enough context window to do certain tasks. This led to the rapid adoption of retrieval augmented generation (RAG) and other techniques which dynamically provide the model with relevant contextual information. 
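As an illustration of the "provide everything upfront" approach described above, a long reference document can be uploaded once and passed directly as context rather than chunked for retrieval. This is a sketch only; the file name and question are hypothetical.
Python
from google import genai

client = genai.Client()

# Upload the full reference document instead of splitting it into chunks.
reference = client.files.upload(file="reference_grammar.pdf")  # hypothetical file

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[reference, "Using only the attached grammar, how is the past tense formed?"],
)
print(response.text)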
Now, with larger and larger context windows, there are new techniques becoming available which \ No newline at end of file diff --git a/docstore/f1f460da-d208-48fe-b577-a1415bdadf47 b/docstore/f1f460da-d208-48fe-b577-a1415bdadf47 new file mode 100644 index 0000000000000000000000000000000000000000..cbba2e467340ab9b0e4d492e9dd7f5543deb9e33 --- /dev/null +++ b/docstore/f1f460da-d208-48fe-b577-a1415bdadf47 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#parallel_function_calling Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f20d97ea-d6f6-433f-8e99-8270834571bc b/docstore/f20d97ea-d6f6-433f-8e99-8270834571bc new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/f20d97ea-d6f6-433f-8e99-8270834571bc @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/f20e78b0-b4df-4b95-9a29-d673fd5760bc b/docstore/f20e78b0-b4df-4b95-9a29-d673fd5760bc new file mode 100644 index 0000000000000000000000000000000000000000..b7901babd54dbafa9f145cdbd26424334c4f88eb --- /dev/null +++ b/docstore/f20e78b0-b4df-4b95-9a29-d673fd5760bc @@ -0,0 +1 @@ +upload ( file = doc_data_2 , config = dict ( mime_type = 'application/pdf' ) ) prompt = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." response = client . models . 
generate_content ( model = "gemini-2.5-flash" , contents = [ sample_pdf_1 , sample_pdf_2 , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function uploadRemotePDF ( url , displayName ) { const pdfBuffer = await fetch ( url ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : displayName , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } return file ; } async function main () { const content = [ 'What is the difference between each of the main benchmarks between these two papers? Output these in a table.' , ]; let file1 = await uploadRemotePDF ( "https://arxiv.org/pdf/2312.11805" , "PDF 1" ) if ( file1 . uri && file1 . mimeType ) { const fileContent = createPartFromUri ( file1 . uri , file1 . mimeType ); content . push ( fileContent ); } let file2 = await uploadRemotePDF ( "https://arxiv.org/pdf/2403.05530" , "PDF 2" ) if ( file2 . uri && file2 . mimeType ) { const fileContent = createPartFromUri ( file2 . uri , file2 . mimeType ); content . push ( fileContent ); } const response = await ai . models . \ No newline at end of file diff --git a/docstore/f21e5130-4505-4c7c-bb2a-1ba48ff9eee6 b/docstore/f21e5130-4505-4c7c-bb2a-1ba48ff9eee6 new file mode 100644 index 0000000000000000000000000000000000000000..a1daa64b7091fa9e5594d9222d90d8c8ad6521d2 --- /dev/null +++ b/docstore/f21e5130-4505-4c7c-bb2a-1ba48ff9eee6 @@ -0,0 +1 @@ +). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. 
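Putting the two speech settings above together, the sketch below opens a Live API session with both a prebuilt voice and an output language in the session config. Combining them in one speech_config, the sample prompt, and the model choice are assumptions for illustration (native audio models pick the language automatically, as noted above).
Python
import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        # Prebuilt voice and output language, per the fields shown above.
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}},
        "language_code": "de-DE",
    },
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Wie ist das Wetter heute?"}]},
            turn_complete=True,
        )
        async for response in session.receive():
            if response.data is not None:
                # Raw audio bytes in the configured voice and language.
                print(f"received {len(response.data)} bytes of audio")

if __name__ == "__main__":
    asyncio.run(main())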
How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ \ No newline at end of file diff --git a/docstore/f256f536-4fef-4490-a293-17c001129e91 b/docstore/f256f536-4fef-4490-a293-17c001129e91 new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/f256f536-4fef-4490-a293-17c001129e91 @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/f2757eca-f45c-402d-87ed-8723b6bf90f4 b/docstore/f2757eca-f45c-402d-87ed-8723b6bf90f4 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/f2757eca-f45c-402d-87ed-8723b6bf90f4 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/f28f0348-28b6-4c57-8b27-1d2804d595db b/docstore/f28f0348-28b6-4c57-8b27-1d2804d595db new file mode 100644 index 0000000000000000000000000000000000000000..233bee7728dc75275e99f81f4f4da5a0ce71d64a --- /dev/null +++ b/docstore/f28f0348-28b6-4c57-8b27-1d2804d595db @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#main-content Title: Gemini thinking | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f298a57f-5b7b-44bf-a7b0-6a2ae6baa2f1 b/docstore/f298a57f-5b7b-44bf-a7b0-6a2ae6baa2f1 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f298a57f-5b7b-44bf-a7b0-6a2ae6baa2f1 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/f2bfa5b2-a2b4-4282-8c2d-77d4fc36a577 b/docstore/f2bfa5b2-a2b4-4282-8c2d-77d4fc36a577 new file mode 100644 index 0000000000000000000000000000000000000000..8ad00481fa69abaa6f89712b17708b667b3ff7f6 --- /dev/null +++ b/docstore/f2bfa5b2-a2b4-4282-8c2d-77d4fc36a577 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-lite Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f30223a0-dfd0-4b8c-a435-117de7030f58 b/docstore/f30223a0-dfd0-4b8c-a435-117de7030f58 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f30223a0-dfd0-4b8c-a435-117de7030f58 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/f3208f73-cd73-4883-8ed7-fed1b66ca3dd b/docstore/f3208f73-cd73-4883-8ed7-fed1b66ca3dd new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/f3208f73-cd73-4883-8ed7-fed1b66ca3dd @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/f3253187-8643-4763-8f15-2be9d9b88603 b/docstore/f3253187-8643-4763-8f15-2be9d9b88603 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/f3253187-8643-4763-8f15-2be9d9b88603 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/f336e8b8-9ba8-4cce-bb68-96c1163dacd4 b/docstore/f336e8b8-9ba8-4cce-bb68-96c1163dacd4 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/f336e8b8-9ba8-4cce-bb68-96c1163dacd4 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/f3876996-1615-4a97-8b29-614e574afd22 b/docstore/f3876996-1615-4a97-8b29-614e574afd22 new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/f3876996-1615-4a97-8b29-614e574afd22 @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. 
You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } \ No newline at end of file diff --git a/docstore/f3bfce99-8713-4e26-8d35-6bc749e49f72 b/docstore/f3bfce99-8713-4e26-8d35-6bc749e49f72 new file mode 100644 index 0000000000000000000000000000000000000000..a6b24f2239a055f1d65a9fc8ecf3b6c3213634db --- /dev/null +++ b/docstore/f3bfce99-8713-4e26-8d35-6bc749e49f72 @@ -0,0 +1 @@ +min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt. Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. 
A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the \ No newline at end of file diff --git a/docstore/f3c7e987-2a6d-4139-ba0b-8c8100fbfab0 b/docstore/f3c7e987-2a6d-4139-ba0b-8c8100fbfab0 new file mode 100644 index 0000000000000000000000000000000000000000..1785bc52e69ea8511733e76440059ca251dc2784 --- /dev/null +++ b/docstore/f3c7e987-2a6d-4139-ba0b-8c8100fbfab0 @@ -0,0 +1 @@ +DISPLAY_NAME_2 = "Gemini_1.5_paper" PROMPT = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." # Function to download and upload a PDF upload_pdf () { local doc_url = " $1 " local display_name = " $2 " # Download the PDF wget -O " ${ display_name } .pdf" " ${ doc_url } " local MIME_TYPE = $( file -b --mime-type " ${ display_name } .pdf" ) local NUM_BYTES = $( wc -c < " ${ display_name } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " local tmp_header_file = upload-header.tmp # Initial resumable request curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ display_name } '}}" 2 > /dev/null local upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the PDF curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ display_name } .pdf" 2 > /dev/null > "file_info_ ${ display_name } .json" local file_uri = $( jq ".file.uri" "file_info_ ${ display_name } .json" ) echo "file_uri for ${ display_name } : ${ file_uri } " # Clean up the downloaded PDF rm " ${ display_name } .pdf" echo " ${ file_uri } " } # Upload the first PDF file_uri_1 = $( upload_pdf " ${ DOC_URL_1 } " " ${ DISPLAY_NAME_1 } " ) # Upload the second PDF file_uri_2 = $( upload_pdf " ${ DOC_URL_2 } " " ${ DISPLAY_NAME_2 } " ) # Now generate content using both files curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X \ No newline at end of file diff --git a/docstore/f3d75240-136a-490a-b995-f7e7f4367b98 b/docstore/f3d75240-136a-490a-b995-f7e7f4367b98 new file mode 100644 index 0000000000000000000000000000000000000000..96036ef0e6317c7c350fd9d242104197fce70d6e --- /dev/null +++ b/docstore/f3d75240-136a-490a-b995-f7e7f4367b98 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/rate-limits#main-content Title: Rate limits | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f3e05055-bf71-4598-9636-35a73b6c2f56 b/docstore/f3e05055-bf71-4598-9636-35a73b6c2f56 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/f3e05055-bf71-4598-9636-35a73b6c2f56 @@ -0,0 +1 @@ +state-of-the-art performance. 
Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/f3fbf46c-516d-476e-8a01-5c1ade7d62db b/docstore/f3fbf46c-516d-476e-8a01-5c1ade7d62db new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/f3fbf46c-516d-476e-8a01-5c1ade7d62db @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/f408f85e-94c8-4a27-8212-622b80a2b18c b/docstore/f408f85e-94c8-4a27-8212-622b80a2b18c new file mode 100644 index 0000000000000000000000000000000000000000..83e8a7f39a569661ceb51609e03cd9ce9f516cda --- /dev/null +++ b/docstore/f408f85e-94c8-4a27-8212-622b80a2b18c @@ -0,0 +1 @@ +will only show up for projects that meet next tier qualifications . After a quick validation, the project will be upgraded to the next tier. Request a rate limit increase Each model variation has an associated rate limit (requests per minute, RPM). For details on those rate limits, see Gemini models . Request paid tier rate limit increase We offer no guarantees about increasing your rate limit, but we'll do our best to review your request and reach out to you if we're able to accommodate your capacity needs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/f421869c-dfcb-45e3-8d56-3283f87f214b b/docstore/f421869c-dfcb-45e3-8d56-3283f87f214b new file mode 100644 index 0000000000000000000000000000000000000000..12eaa70edcc32ba40cc62f56233fa0adea94b474 --- /dev/null +++ b/docstore/f421869c-dfcb-45e3-8d56-3283f87f214b @@ -0,0 +1 @@ +Gemini API libraries | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for the Python, JavaScript/TypeScript, Go and Java languages. 
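Before the per-language installation details that follow, here is a minimal Python sketch of what using the SDK looks like end to end; the model name and prompt are illustrative, and the client is assumed to pick up an API key from the environment.
Python
# pip install google-genai
from google import genai

# By default the client reads the API key from the environment
# (GEMINI_API_KEY); it can also be passed explicitly as api_key="...".
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)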
You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies: com.google.genai google-genai 1.0.0 General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means \ No newline at end of file diff --git a/docstore/f4241c8a-496c-4622-acd1-7a14de4f115f b/docstore/f4241c8a-496c-4622-acd1-7a14de4f115f new file mode 100644 index 0000000000000000000000000000000000000000..68dfcf53eb693dba8358b7fdf6b0010fadcbc966 --- /dev/null +++ b/docstore/f4241c8a-496c-4622-acd1-7a14de4f115f @@ -0,0 +1 @@ +. files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), ]), }); console . log ( countTokensResponse . totalTokens ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } tokens , _ := client . Models . CountTokens ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Printf ( "File %s is %d tokens\n" , localAudioPath , tokens . TotalTokens ) } Supported audio formats Gemini supports the following audio format MIME types: WAV - audio/wav MP3 - audio/mp3 AIFF - audio/aiff AAC - audio/aac OGG Vorbis - audio/ogg FLAC - audio/flac Technical details about audio Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens. Gemini can "understand" non-speech components, such as birdsong or sirens. The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the number of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours. Gemini downsamples audio files to a 16 Kbps data resolution. If the audio source contains multiple channels, Gemini combines those channels into a single channel. What's next This guide shows how to generate text in response to audio data. To learn more, see the following resources: File prompting strategies : \ No newline at end of file diff --git a/docstore/f46b852e-d0c4-4794-9c12-6547b8b6df03 b/docstore/f46b852e-d0c4-4794-9c12-6547b8b6df03 new file mode 100644 index 0000000000000000000000000000000000000000..b0033fbc695240330e92f0eacef1e843c48482b9 --- /dev/null +++ b/docstore/f46b852e-d0c4-4794-9c12-6547b8b6df03 @@ -0,0 +1 @@ +Supported values are "16:9" and "9:16" . The default is "16:9" . 
personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations . numberOfVideos : Output videos requested, either 1 or 2 . durationSeconds : Length of each output video in seconds, between 5 and 8 . enhance_prompt : Enable or disable the prompt rewriter. Enabled by default. Specifications Modalities Text-to-video generation Image-to-video generation Request latency Min: 11 seconds Max: 6 minutes (during peak hours) Variable length generation 5-8 seconds Resolution 720p Frame rate 24fps Aspect ratio 16:9 - landscape 9:16 - portrait Input languages (text-to-video) English Limitations Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo. Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks. Things to try To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to: Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot"). Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high \ No newline at end of file diff --git a/docstore/f482cc03-2625-4af9-98d6-416d6fe6c91a b/docstore/f482cc03-2625-4af9-98d6-416d6fe6c91a new file mode 100644 index 0000000000000000000000000000000000000000..8562c6ca5d2a89dac90935227121a5fd486f1f09 --- /dev/null +++ b/docstore/f482cc03-2625-4af9-98d6-416d6fe6c91a @@ -0,0 +1 @@ +establishing your core idea, and then refine and expand upon that core idea until the generated image is close to your vision. Prompt: A park in the spring next to a lake Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour Prompt: A park in the spring next to a lake, the sun sets across the lake, golden hour, red wildflowers Imagen models can transform your ideas into detailed images, whether your prompts are short or long and detailed. Refine your vision through iterative prompting, adding details until you achieve the perfect result. Short prompts let you generate an image quickly. Prompt: close-up photo of a woman in her 20s, street photography, movie still, muted orange warm tones Longer prompts let you add specific details and build your image. Prompt: captivating photo of a woman in her 20s utilizing a street photography style. The image should look like a movie still with muted orange warm tones. Additional advice for Imagen prompt writing: Use descriptive language : Employ detailed adjectives and adverbs to paint a clear picture for Imagen. Provide context : If necessary, include background information to aid the AI's understanding. 
Reference specific artists or styles : If you have a particular aesthetic in mind, referencing specific artists or art movements can be helpful. Use prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. Enhancing the facial details in your personal and group images : Specify facial details as a focus of the photo (for example, use the word "portrait" in the prompt). Generate text in images Imagen models can add text into images, opening up more creative image generation possibilities. Use the following guidance to get the most out of this feature: Iterate with confidence : You might have to regenerate images until you achieve the look you want. Imagen's text integration is still evolving, and sometimes \ No newline at end of file diff --git a/docstore/f49dccde-f84e-4cad-b125-3b7f8f4d2f79 b/docstore/f49dccde-f84e-4cad-b125-3b7f8f4d2f79 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f49dccde-f84e-4cad-b125-3b7f8f4d2f79 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/f4b76f11-0fb1-470a-8130-5bd7de15f957 b/docstore/f4b76f11-0fb1-470a-8130-5bd7de15f957 new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/f4b76f11-0fb1-470a-8130-5bd7de15f957 @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . 
Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/f4d01ad3-ab8a-478f-8cf0-869259615c90 b/docstore/f4d01ad3-ab8a-478f-8cf0-869259615c90 new file mode 100644 index 0000000000000000000000000000000000000000..7e8ad5dab4abf30afa5f2a7a1bb2bb58fcb2f5d0 --- /dev/null +++ b/docstore/f4d01ad3-ab8a-478f-8cf0-869259615c90 @@ -0,0 +1 @@ +-X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * \ No newline at end of file diff --git a/docstore/f4e9309c-573d-4a79-9722-d8e50ce2eafa b/docstore/f4e9309c-573d-4a79-9722-d8e50ce2eafa new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/f4e9309c-573d-4a79-9722-d8e50ce2eafa @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . 
name } \ No newline at end of file diff --git a/docstore/f4f88cce-6386-41ca-b9f8-6efe8a1ec70e b/docstore/f4f88cce-6386-41ca-b9f8-6efe8a1ec70e new file mode 100644 index 0000000000000000000000000000000000000000..b74eb69053b438c297a04f89cccda77252b02b92 --- /dev/null +++ b/docstore/f4f88cce-6386-41ca-b9f8-6efe8a1ec70e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session Title: Session management with Live API | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f509561a-3288-4450-8d61-ea02e6025331 b/docstore/f509561a-3288-4450-8d61-ea02e6025331 new file mode 100644 index 0000000000000000000000000000000000000000..deed43be9d78353ae146822eb2d40897035c76a7 --- /dev/null +++ b/docstore/f509561a-3288-4450-8d61-ea02e6025331 @@ -0,0 +1 @@ +"What other color sofas would work in my space? can you update the image?" Multi-turn image editing (chat): Keep generating / editing images conversationally. Example prompts: [upload an image of a blue car.] , "Turn this car into a convertible.", "Now change the color to yellow." Limitations For best performance, use the following languages: EN, es-MX, ja-JP, zh-CN, hi-IN. Image generation does not support audio or video inputs. Image generation may not always trigger: The model may output text only. Try asking for image outputs explicitly (e.g. "generate an image", "provide images as you go along", "update the image"). The model may stop generating partway through. Try again or try a different prompt. When generating text for an image, Gemini works best if you first generate the text and then ask for an image with the text. There are some regions/countries where Image generation is not available. See Models for more information. Generate images using the Imagen models This example demonstrates generating images with an Imagen model : Python from google import genai from google.genai import types from PIL import Image from io import BytesIO client = genai . Client () response = client . models . generate_images ( model = 'imagen-4.0-generate-preview-06-06' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 4 , ) ) for generated_image in response . generated_images : generated_image . image . show () JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateImages ({ model : 'imagen-4.0-generate-preview-06-06' , prompt : 'Robot holding a red skateboard' , config : { numberOfImages : 4 , }, }); let idx = 1 ; for ( const generatedImage of response . generatedImages ) { let imgBytes = generatedImage . image . imageBytes ; const buffer = Buffer . from ( imgBytes , "base64" ); fs . \ No newline at end of file diff --git a/docstore/f51025ec-e6e3-4787-9a05-f3f9c23d95a1 b/docstore/f51025ec-e6e3-4787-9a05-f3f9c23d95a1 new file mode 100644 index 0000000000000000000000000000000000000000..18b4c9f67696057e7a072a89e87be8efb7e767eb --- /dev/null +++ b/docstore/f51025ec-e6e3-4787-9a05-f3f9c23d95a1 @@ -0,0 +1 @@ +So for instance, instead of creating an application to write an email reply from scratch, you might instead limit it to expanding on an outline or suggesting alternative phrasings. Perform safety testing appropriate to your use case Testing is a key part of building robust and safe applications, but the extent, scope and strategies for testing will vary. 
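Following on from the JSONL upload shown above: once the input file is available through the File API, a batch job can reference it by file name. A minimal sketch, assuming the client.batches surface of the google-genai Python SDK; the display names are illustrative.

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

# Upload the JSONL input file (same call as in the example above).
uploaded_file = client.files.upload(
    file="my-batch-requests.jsonl",
    config=types.UploadFileConfig(display_name="my-batch-requests", mime_type="jsonl"),
)

# Create a batch job that reads its requests from the uploaded file.
batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=uploaded_file.name,
    config={"display_name": "my-batch-requests-job"},
)

# Poll until the job reaches a terminal state.
while True:
    batch_job = client.batches.get(name=batch_job.name)
    if batch_job.state.name in ("JOB_STATE_SUCCEEDED", "JOB_STATE_FAILED", "JOB_STATE_CANCELLED"):
        break
    time.sleep(30)

print(f"Job finished with state: {batch_job.state.name}")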
For example, a just-for-fun haiku generator is likely to pose less severe risks than, say, an application designed for use by law firms to summarize legal documents and help draft contracts. But the haiku generator may be used by a wider variety of users which means the potential for adversarial attempts or even unintended harmful inputs can be greater. The implementation context also matters. For instance, an application with outputs that are reviewed by human experts prior to any action being taken might be deemed less likely to produce harmful outputs than the identical application without such oversight. It's not uncommon to go through several iterations of making changes and testing before feeling confident that you're ready to launch, even for applications that are relatively low risk. Two kinds of testing are particularly useful for AI applications: Safety benchmarking involves designing safety metrics that reflect the ways your application could be unsafe in the context of how it is likely to get used, then testing how well your application performs on the metrics using evaluation datasets. It's good practice to think about the minimum acceptable levels of safety metrics before testing so that 1) you can evaluate the test results against those expectations and 2) you can gather the evaluation dataset based on the tests that evaluate the metrics you care about most. Advanced tips Beware of over-relying on “off the shelf” approaches as it's likely you'll need to build your own testing datasets using human raters to fully suit your application's context. If you have more than one metric you'll need to decide \ No newline at end of file diff --git a/docstore/f54549f4-8f2c-4db2-b656-590136957e43 b/docstore/f54549f4-8f2c-4db2-b656-590136957e43 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/f54549f4-8f2c-4db2-b656-590136957e43 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
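The troubleshooting prompts described above ("Describe what's in this image", "Please explain why") can be sent with the same image-plus-text pattern used elsewhere in these docs. A minimal sketch, assuming a local sample image; the file path and prompt wording are illustrative.

Python
from google import genai

client = genai.Client()

# Upload the image once and reuse it for both the task prompt and the troubleshooting prompt.
my_file = client.files.upload(file="path/to/snack_photo.jpg")

# Original task prompt.
task = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[my_file, "What's a snack I can make in 1 minute that would go well with this?"],
)
print(task.text)

# Troubleshooting prompt: check whether the model recognized the image correctly.
check = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[my_file, "Describe what's in this image."],
)
print(check.text)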
\ No newline at end of file diff --git a/docstore/f5655cba-2f74-4788-a3a2-945a1f585cbf b/docstore/f5655cba-2f74-4788-a3a2-945a1f585cbf new file mode 100644 index 0000000000000000000000000000000000000000..36b0f0f8a4df60acd9dd94249f5fced4282af350 --- /dev/null +++ b/docstore/f5655cba-2f74-4788-a3a2-945a1f585cbf @@ -0,0 +1 @@ +Get started with Live API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Get started with Live API Preview: The Live API is in preview. The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection , tool use and function calling , session management (for managing long running conversations) and ephemeral tokens (for secure client-sided authentication). This page gets you up and running with examples and basic code samples. Example applications Check out the following example applications that illustrate how to use Live API for end-to-end use cases: Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers. Live API Python cookbook using Pyaudio that connects to Live API. Partner integrations If you prefer a simpler development process, you can use Daily or LiveKit . These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications. Before you begin building There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach. Choose an audio generation architecture If you're building an audio-based use case, your choice of model determines the audio generation architecture \ No newline at end of file diff --git a/docstore/f5894419-3093-4d92-a87b-0b9d3e67a97b b/docstore/f5894419-3093-4d92-a87b-0b9d3e67a97b new file mode 100644 index 0000000000000000000000000000000000000000..c72243f466a39b8c76a4237da0daa8b99ecbbe13 --- /dev/null +++ b/docstore/f5894419-3093-4d92-a87b-0b9d3e67a97b @@ -0,0 +1 @@ +}, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } \ No newline at end of file diff --git a/docstore/f595fe83-d47c-430d-a676-0bc6d0666118 b/docstore/f595fe83-d47c-430d-a676-0bc6d0666118 new file mode 100644 index 0000000000000000000000000000000000000000..5517c0bf6b0252fb1080acbdd22b2b409d2663d8 --- /dev/null +++ b/docstore/f595fe83-d47c-430d-a676-0bc6d0666118 @@ -0,0 +1 @@ +signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : \ No newline at end of file diff --git a/docstore/f598e72c-3a10-41be-835b-1981c2759922 b/docstore/f598e72c-3a10-41be-835b-1981c2759922 new file mode 100644 index 0000000000000000000000000000000000000000..872e6db079e0bc056e68ec493355ac4cd11f274a --- /dev/null +++ b/docstore/f598e72c-3a10-41be-835b-1981c2759922 @@ -0,0 +1 @@ +audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 \ No newline at end of file diff --git a/docstore/f5997453-e548-4116-9e66-55e46edd3d5e b/docstore/f5997453-e548-4116-9e66-55e46edd3d5e new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/f5997453-e548-4116-9e66-55e46edd3d5e @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
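To check which of the model codes listed above are available to a given API key, the model list endpoint can be enumerated. A minimal sketch, assuming the client.models.list() pager in the google-genai Python SDK.

Python
from google import genai

client = genai.Client()

# Iterate the paginated model list and print each model's resource name,
# e.g. "models/gemini-2.5-flash" or "models/imagen-3.0-generate-002".
for model in client.models.list():
    print(model.name)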
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/f5a45769-47e4-4d7a-b764-02f1e5dd4d47 b/docstore/f5a45769-47e4-4d7a-b764-02f1e5dd4d47 new file mode 100644 index 0000000000000000000000000000000000000000..933572d214295ba3502f9da18dc8703a97691ed7 --- /dev/null +++ b/docstore/f5a45769-47e4-4d7a-b764-02f1e5dd4d47 @@ -0,0 +1 @@ +captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend to always include few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? 
Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the \ No newline at end of file diff --git a/docstore/f5c6c44d-802a-4c08-84a0-a600e13549bd b/docstore/f5c6c44d-802a-4c08-84a0-a600e13549bd new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/f5c6c44d-802a-4c08-84a0-a600e13549bd @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/f5d0d265-c97a-4f2e-b610-a76ea429dcaa b/docstore/f5d0d265-c97a-4f2e-b610-a76ea429dcaa new file mode 100644 index 0000000000000000000000000000000000000000..da790334610df2eb1eb6ac9bb30354d4f1eafe8b --- /dev/null +++ b/docstore/f5d0d265-c97a-4f2e-b610-a76ea429dcaa @@ -0,0 +1 @@ +the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. 
Therefore, the 3 rolls will last for about a month. Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. Some ways to improve this is to ask \ No newline at end of file diff --git a/docstore/f5ecd119-c008-4be8-87e3-330c22b7ac7b b/docstore/f5ecd119-c008-4be8-87e3-330c22b7ac7b new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/f5ecd119-c008-4be8-87e3-330c22b7ac7b @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/f60e3d81-afbb-4170-bc3d-6675a54b0d40 b/docstore/f60e3d81-afbb-4170-bc3d-6675a54b0d40 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f60e3d81-afbb-4170-bc3d-6675a54b0d40 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. 
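For the text-to-speech preview models described above, audio is requested through generate_content with an AUDIO response modality and a speech config. A minimal sketch, assuming the speech_config surface of the google-genai Python SDK; the prompt, output file name, and choice of the "Kore" prebuilt voice are illustrative. The output is raw 16-bit PCM at 24kHz, consistent with the audio format notes elsewhere in these docs.

Python
import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The generated audio arrives as inline PCM data on the first part.
data = response.candidates[0].content.parts[0].inline_data.data

with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)      # mono
    wf.setsampwidth(2)      # 16-bit samples
    wf.setframerate(24000)  # 24kHz output sample rate
    wf.writeframes(data)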
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/f61c02ae-1abd-4950-b0af-46da184daec1 b/docstore/f61c02ae-1abd-4950-b0af-46da184daec1 new file mode 100644 index 0000000000000000000000000000000000000000..7c01a8e52c6f77d499351d1468f50bccd1328e77 --- /dev/null +++ b/docstore/f61c02ae-1abd-4950-b0af-46da184daec1 @@ -0,0 +1 @@ +function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . 
Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { \ No newline at end of file diff --git a/docstore/f633a392-46a1-490d-a9aa-4f76685d7de9 b/docstore/f633a392-46a1-490d-a9aa-4f76685d7de9 new file mode 100644 index 0000000000000000000000000000000000000000..a07288c87291962aa181765ea596a2b1afe9ed3f --- /dev/null +++ b/docstore/f633a392-46a1-490d-a9aa-4f76685d7de9 @@ -0,0 +1 @@ +Model Function Calling Parallel Function Calling Compositional Function Calling Gemini 2.5 Pro ✔️ ✔️ ✔️ Gemini 2.5 Flash ✔️ ✔️ ✔️ Gemini 2.5 Flash-Lite ✔️ ✔️ ✔️ Gemini 2.0 Flash ✔️ ✔️ ✔️ Gemini 2.0 Flash-Lite X X X Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools. Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling : Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token \ No newline at end of file diff --git a/docstore/f647e794-32fc-4871-989f-599745005f22 b/docstore/f647e794-32fc-4871-989f-599745005f22 new file mode 100644 index 0000000000000000000000000000000000000000..43d235dee43f472c269052714c9705874463b9cd --- /dev/null +++ b/docstore/f647e794-32fc-4871-989f-599745005f22 @@ -0,0 +1 @@ +tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . 
function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details, it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = \ No newline at end of file diff --git a/docstore/f6550450-5a9c-4a2b-8717-335a48c189b7 b/docstore/f6550450-5a9c-4a2b-8717-335a48c189b7 new file mode 100644 index 0000000000000000000000000000000000000000..02470c314e7374f5d8e35e6eac2d2cfdef8fad5a --- /dev/null +++ b/docstore/f6550450-5a9c-4a2b-8717-335a48c189b7 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide Title: Live API capabilities guide | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f69830f6-5527-46cf-9a8a-ed353ed1b084 b/docstore/f69830f6-5527-46cf-9a8a-ed353ed1b084 new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/f69830f6-5527-46cf-9a8a-ed353ed1b084 @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. 
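Putting the function calling steps above together, a condensed end-to-end round trip in Python might look like the following sketch. The set_light_values function body and its declaration schema are illustrative assumptions standing in for the declarations defined earlier in the guide.

Python
from google import genai
from google.genai import types

client = genai.Client()

# Illustrative local function standing in for set_light_values.
def set_light_values(brightness: int, color_temp: str) -> dict:
    """Pretend to set the light state and report it back."""
    return {"brightness": brightness, "colorTemperature": color_temp}

set_light_values_declaration = {
    "name": "set_light_values",
    "description": "Sets the brightness and color temperature of a light.",
    "parameters": {
        "type": "object",
        "properties": {
            "brightness": {"type": "integer", "description": "Light level from 0 to 100."},
            "color_temp": {"type": "string", "enum": ["daylight", "cool", "warm"]},
        },
        "required": ["brightness", "color_temp"],
    },
}

config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[set_light_values_declaration])]
)
contents = [
    types.Content(role="user", parts=[types.Part(text="Turn the lights down to a romantic level")])
]

# Steps 1-2: send the request and read the function call the model proposes.
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
tool_call = response.candidates[0].content.parts[0].function_call

# Step 3: execute the local function with the model-provided arguments.
result = set_light_values(**tool_call.args)

# Step 4: return the result so the model can phrase a user-facing answer.
contents.append(response.candidates[0].content)
contents.append(
    types.Content(
        role="user",
        parts=[types.Part.from_function_response(name=tool_call.name, response={"result": result})],
    )
)
final = client.models.generate_content(model="gemini-2.5-flash", contents=contents, config=config)
print(final.text)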
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/f6a98ba8-7605-46ed-a3c4-d857b9bb7bde b/docstore/f6a98ba8-7605-46ed-a3c4-d857b9bb7bde new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/f6a98ba8-7605-46ed-a3c4-d857b9bb7bde @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . 
Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/f6b20fb3-d187-488b-abff-91d4e5b072eb b/docstore/f6b20fb3-d187-488b-abff-91d4e5b072eb new file mode 100644 index 0000000000000000000000000000000000000000..93bb58be7a1c23479b00c2f8000473492f3006e4 --- /dev/null +++ b/docstore/f6b20fb3-d187-488b-abff-91d4e5b072eb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#live-api-2.0 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f6cee32f-69a8-4dca-8f8c-81e4795d198d b/docstore/f6cee32f-69a8-4dca-8f8c-81e4795d198d new file mode 100644 index 0000000000000000000000000000000000000000..c553f327a540b7841117b12e24b225cb1fd824fa --- /dev/null +++ b/docstore/f6cee32f-69a8-4dca-8f8c-81e4795d198d @@ -0,0 +1 @@ +Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
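For the experimental Gemini embedding model mentioned above, embeddings are requested through embed_content. A minimal sketch, assuming the client.models.embed_content surface of the google-genai Python SDK; the input text is illustrative, and the output_dimensionality value is one of the elastic sizes listed in the model details.

Python
from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="What is the meaning of life?",
    config=types.EmbedContentConfig(output_dimensionality=768),  # 3072, 1536, or 768
)

# One embedding is returned per input; each carries a list of float values.
for embedding in result.embeddings:
    print(len(embedding.values))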
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves \ No newline at end of file diff --git a/docstore/f70474c5-ad89-4d44-93e1-611d7ae53ba5 b/docstore/f70474c5-ad89-4d44-93e1-611d7ae53ba5 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/f70474c5-ad89-4d44-93e1-611d7ae53ba5 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/f72c24d9-60ff-4327-81fe-214cea7dfa77 b/docstore/f72c24d9-60ff-4327-81fe-214cea7dfa77 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/f72c24d9-60ff-4327-81fe-214cea7dfa77 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. 
Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/f73128db-1dc5-4ef9-8082-4a5941d9ddc3 b/docstore/f73128db-1dc5-4ef9-8082-4a5941d9ddc3 new file mode 100644 index 0000000000000000000000000000000000000000..c1222b1eb00e14a7d2a482f186a5d8fda014fef3 --- /dev/null +++ b/docstore/f73128db-1dc5-4ef9-8082-4a5941d9ddc3 @@ -0,0 +1 @@ +person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . 
Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . Text ) } } } } Thought signatures \ No newline at end of file diff --git a/docstore/f738e210-4f14-4890-9daf-33bd5034a4f3 b/docstore/f738e210-4f14-4890-9daf-33bd5034a4f3 new file mode 100644 index 0000000000000000000000000000000000000000..d464a7e5141c7bcc5fa86ba919979db27614ba5c --- /dev/null +++ b/docstore/f738e210-4f14-4890-9daf-33bd5034a4f3 @@ -0,0 +1 @@ +Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/f73c8369-3842-470b-b0ca-5c9af036517d b/docstore/f73c8369-3842-470b-b0ca-5c9af036517d new file mode 100644 index 0000000000000000000000000000000000000000..109bd0eaeef926ee02f7659dee842724a4ad9423 --- /dev/null +++ b/docstore/f73c8369-3842-470b-b0ca-5c9af036517d @@ -0,0 +1 @@ +the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and using one of our legacy libraries, we strongly recommend you to migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path. They will stop receiving updates at the end of September 2025, the feature gaps will grow and potential bugs may no longer get fixed. Each legacy library's support status varies, detailed in the following table: Language Legacy library Support status Recommended library Python google-generativeai All support, including bug fixes, ends end of September 2025. google-genai JavaScript/TypeScript @google/generativeai All support, including bug fixes, ends end of September 2025. @google/genai Go google.golang.org/generative-ai All support, including bug fixes, ends end of September 2025. google.golang.org/genai Dart and Flutter google_generative_ai Not actively maintained Use trusted community or third party libraries, like firebase_ai , or access using REST API Swift generative-ai-swift Not actively maintained Use Gemini in Firebase Android generative-ai-android Not actively maintained Use Gemini in Firebase Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
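The JavaScript and Go thought-summary streaming examples above have a direct Python counterpart. A minimal sketch using the same house-and-pet puzzle prompt, assuming the generate_content_stream and ThinkingConfig surfaces of the google-genai Python SDK.

Python
from google import genai
from google.genai import types

client = genai.Client()

prompt = """Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue.
The person who lives in the red house owns a cat. Bob does not live in the green house.
Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat.
Who lives in each house, and what pet do they own?"""

thoughts = ""
answer = ""

for chunk in client.models.generate_content_stream(
    model="gemini-2.5-pro",
    contents=prompt,
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(include_thoughts=True)
    ),
):
    for part in chunk.candidates[0].content.parts:
        if not part.text:
            continue
        if part.thought:
            # Parts flagged as thoughts carry the model's thought summary.
            if not thoughts:
                print("Thoughts summary:")
            print(part.text)
            thoughts += part.text
        else:
            if not answer:
                print("Answer:")
            print(part.text)
            answer += part.text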
For details, see the Google \ No newline at end of file diff --git a/docstore/f75dcf20-22aa-433a-a813-f5e4007de906 b/docstore/f75dcf20-22aa-433a-a813-f5e4007de906 new file mode 100644 index 0000000000000000000000000000000000000000..c085d8aece3abc99a010c5a69268bce2397f0e27 --- /dev/null +++ b/docstore/f75dcf20-22aa-433a-a813-f5e4007de906 @@ -0,0 +1 @@ +100mm Macro lens Model: imagen-3.0-generate-002 Motion Use case Lens type Focal lengths Additional details Sports, wildlife (motion) Telephoto zoom 100-400mm Fast shutter speed, Action or movement tracking Using several keywords from the table, Imagen can generate the following motion images: Prompt: a winning touchdown, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Prompt: A deer running in the forest, fast shutter speed, movement tracking Model: imagen-3.0-generate-002 Wide-angle Use case Lens type Focal lengths Additional details Astronomical, landscape (wide-angle) Wide-angle 10-24mm Long exposure times, sharp focus, long exposure, smooth water or clouds Using several keywords from the table, Imagen can generate the following wide-angle images: Prompt: an expansive mountain range, landscape wide angle 10mm Model: imagen-3.0-generate-002 Prompt: a photo of the moon, astro photography, wide angle 10mm Model: imagen-3.0-generate-002 What's next Check out the Veo guide to learn how to generate videos with the Gemini API. To learn more about Gemini models, see Gemini models and Experimental models . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/f760a725-b2a9-4b8b-83c9-d877c4e4672e b/docstore/f760a725-b2a9-4b8b-83c9-d877c4e4672e new file mode 100644 index 0000000000000000000000000000000000000000..f4dff28679e092f3875b52fe8358f03006200be3 --- /dev/null +++ b/docstore/f760a725-b2a9-4b8b-83c9-d877c4e4672e @@ -0,0 +1 @@ +gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. 
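As a companion to the photography keyword tables above, the following is a minimal sketch of generating one of those motion prompts with Imagen 3 through the google-genai Python SDK. The output filename is an illustrative assumption, and the byte-handling detail (image.image_bytes) mirrors the imageBytes field shown in the JavaScript samples elsewhere in these docs.

```python
from google import genai
from google.genai import types

client = genai.Client()

# Prompt assembled from the motion keywords above (fast shutter speed, movement tracking).
response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="a winning touchdown, fast shutter speed, movement tracking",
    config=types.GenerateImagesConfig(number_of_images=1),
)

# Assumption: each generated image exposes its raw bytes via image.image_bytes.
with open("touchdown.png", "wb") as f:
    f.write(response.generated_images[0].image.image_bytes)
```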
Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 \ No newline at end of file diff --git a/docstore/f764860c-ac32-4b35-b68e-3486b87b293e b/docstore/f764860c-ac32-4b35-b68e-3486b87b293e new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/f764860c-ac32-4b35-b68e-3486b87b293e @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . 
\ No newline at end of file diff --git a/docstore/f76ab069-91b2-4d21-9387-faa657b3cbfc b/docstore/f76ab069-91b2-4d21-9387-faa657b3cbfc new file mode 100644 index 0000000000000000000000000000000000000000..b6194c8105fe9f40c0d9a89b00594ca4d33e213c --- /dev/null +++ b/docstore/f76ab069-91b2-4d21-9387-faa657b3cbfc @@ -0,0 +1 @@ +While meeting the stated qualification criteria is generally sufficient for approval, in rare cases an upgrade request may be denied based on other factors identified during the review process. This system helps maintain the security and integrity of the Gemini API platform for all users. Standard API rate limits The following table lists the rate limits for all standard Gemini API calls. Free Tier Model RPM TPM RPD Gemini 2.5 Pro 5 250,000 100 Gemini 2.5 Flash 10 250,000 250 Gemini 2.5 Flash-Lite Preview 06-17 15 250,000 1,000 Gemini 2.5 Flash Preview TTS 3 10,000 15 Gemini 2.5 Pro Preview TTS -- -- -- Gemini 2.0 Flash 15 1,000,000 200 Gemini 2.0 Flash Preview Image Generation 10 200,000 100 Gemini 2.0 Flash-Lite 30 1,000,000 200 Imagen 3 -- -- -- Veo 2 -- -- -- Gemini 1.5 Flash (Deprecated) 15 250,000 50 Gemini 1.5 Flash-8B (Deprecated) 15 250,000 50 Gemini 1.5 Pro (Deprecated) -- -- -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 5 -- 100 Tier 1 Model RPM TPM RPD Gemini 2.5 Pro 150 2,000,000 1,000 Gemini 2.5 Flash 1,000 1,000,000 10,000 Gemini 2.5 Flash-Lite Preview 06-17 4,000 4,000,000 -- Gemini 2.5 Flash Preview TTS 10 10,000 100 Gemini 2.5 Pro Preview TTS 10 10,000 50 Gemini 2.0 Flash 2,000 4,000,000 -- Gemini 2.0 Flash Preview Image Generation 1,000 1,000,000 10,000 Gemini 2.0 Flash-Lite 4,000 4,000,000 -- Imagen 4 Standard 10 -- 70 Imagen 4 Ultra 5 -- 30 Imagen 3 20 -- -- Veo 2 2 videos per minute -- 50 videos per day Gemini 1.5 Flash (Deprecated) 2,000 4,000,000 -- Gemini 1.5 Flash-8B (Deprecated) 4,000 4,000,000 -- Gemini 1.5 Pro (Deprecated) 1,000 4,000,000 -- Gemma 3 & 3n 30 15,000 14,400 Gemini Embedding Experimental 03-07 10 -- 1,000 Tier 2 Model RPM TPM RPD Gemini 2.5 Pro 1,000 5,000,000 50,000 Gemini 2.5 Flash 2,000 3,000,000 100,000 Gemini 2.5 Flash-Lite Preview 06-17 10,000 10,000,000 100,000 Gemini 2.5 Flash Preview TTS 1,000 100,000 10,000 Gemini 2.5 Pro Preview TTS 100 25,000 1,000 Gemini 2.0 Flash 10,000 10,000,000 -- \ No newline at end of file diff --git a/docstore/f78eda32-c3b5-487f-8c0e-a3a32868123b b/docstore/f78eda32-c3b5-487f-8c0e-a3a32868123b new file mode 100644 index 0000000000000000000000000000000000000000..f768002e22e546af8fbd249f6201ab1a1006d078 --- /dev/null +++ b/docstore/f78eda32-c3b5-487f-8c0e-a3a32868123b @@ -0,0 +1 @@ +const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. 
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . \ No newline at end of file diff --git a/docstore/f7a3f5b8-6d30-4f97-874e-72a2718183e9 b/docstore/f7a3f5b8-6d30-4f97-874e-72a2718183e9 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/f7a3f5b8-6d30-4f97-874e-72a2718183e9 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. 
\ No newline at end of file diff --git a/docstore/f7a8fb3b-6504-4501-af68-cb9f199ff977 b/docstore/f7a8fb3b-6504-4501-af68-cb9f199ff977 new file mode 100644 index 0000000000000000000000000000000000000000..bdac38d149644927b7d126aee3930efd52a696eb --- /dev/null +++ b/docstore/f7a8fb3b-6504-4501-af68-cb9f199ff977 @@ -0,0 +1 @@ +turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( \ No newline at end of file diff --git a/docstore/f7b25009-d0c0-4c88-aa38-e4dc0fea8f15 b/docstore/f7b25009-d0c0-4c88-aa38-e4dc0fea8f15 new file mode 100644 index 0000000000000000000000000000000000000000..b23a8acc5f0d54a573ae6bf2c9ff53a2c6e1da77 --- /dev/null +++ b/docstore/f7b25009-d0c0-4c88-aa38-e4dc0fea8f15 @@ -0,0 +1 @@ +Rate limits | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Rate limits Rate limits regulate the number of requests you can make to the Gemini API within a given timeframe. These limits help maintain fair usage, protect against abuse, and help maintain system performance for all users. How rate limits work Rate limits are usually measured across three dimensions: Requests per minute ( RPM ) Requests per day ( RPD ) Tokens per minute (input) ( TPM ) Your usage is evaluated against each limit, and exceeding any of them will trigger a rate limit error. 
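When a request does exceed one of these limits, the API rejects it with a rate-limit error, and a common client-side pattern is to retry with exponential backoff. Below is a minimal sketch assuming the google-genai Python SDK; generate_with_backoff is a hypothetical helper, and a real integration would inspect the specific SDK error (for example an HTTP 429 status) rather than catching every exception.

```python
import time
from google import genai

client = genai.Client()

def generate_with_backoff(prompt: str, max_attempts: int = 5):
    """Hypothetical helper: retry a generateContent call with exponential backoff."""
    delay = 1.0
    for attempt in range(max_attempts):
        try:
            return client.models.generate_content(
                model="gemini-2.5-flash", contents=prompt
            )
        except Exception:  # assumption: rate-limit rejections surface here as exceptions
            if attempt == max_attempts - 1:
                raise
            time.sleep(delay)  # wait before retrying
            delay *= 2         # double the wait each attempt
    return None

print(generate_with_backoff("Hello!").text)
```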
For example, if your RPM limit is 20, making 21 requests within a minute will result in an error, even if you haven't exceeded your TPM or other limits. Rate limits are applied per project, not per API key. Limits vary depending on the specific model being used, and some limits only apply to specific models. For example, Images per minute, or IPM, is only calculated for models capable of generating images (Imagen 3), but is conceptually similar to TPM. Other models might have a token per day limit (TPD). Rate limits are more restricted for experimental and preview models. Usage tiers Rate limits are tied to the project's usage tier. As your API usage and spending increase, you'll have an option to upgrade to a higher tier with increased rate limits. Tier Qualifications Free Users in eligible countries Tier 1 Billing account linked to the project Tier 2 Total spend: > $250 and at least 30 days since successful payment Tier 3 Total spend: > $1,000 and at least 30 days since successful payment When you request an upgrade, our automated abuse protection system performs additional checks. \ No newline at end of file diff --git a/docstore/f7c4189e-b8d2-428a-9577-54954309c922 b/docstore/f7c4189e-b8d2-428a-9577-54954309c922 new file mode 100644 index 0000000000000000000000000000000000000000..c447d211af2a61edaf07db2bb1e477c2acfdf1c9 --- /dev/null +++ b/docstore/f7c4189e-b8d2-428a-9577-54954309c922 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/openai#main-content Title: OpenAI compatibility | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f7cfb7a2-9ecc-47cd-9f92-2c05fa2a3e01 b/docstore/f7cfb7a2-9ecc-47cd-9f92-2c05fa2a3e01 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f7cfb7a2-9ecc-47cd-9f92-2c05fa2a3e01 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
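Before the variant overview that follows, note that you can also enumerate the models available to your API key programmatically. A short sketch, assuming the google-genai Python SDK, where name is taken to be an attribute of the returned model objects:

```python
from google import genai

client = genai.Client()

# Print the identifier of every model this key and project can access.
for model in client.models.list():
    print(model.name)
```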
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f7cfcba6-cea7-4334-a151-c2d68c526ca2 b/docstore/f7cfcba6-cea7-4334-a151-c2d68c526ca2 new file mode 100644 index 0000000000000000000000000000000000000000..84742c0b7906ca5a168857eb7577a7e191bcdffb --- /dev/null +++ b/docstore/f7cfcba6-cea7-4334-a151-c2d68c526ca2 @@ -0,0 +1 @@ +prompt - urban background, man-made structures, dark, stormy, or threatening atmosphere. Aspect ratios Gemini Veo video generation supports the following two aspect ratios: Aspect ratio Description Widescreen or 16:9 The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes. Portrait or 9:16 Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as Youtube shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, waterfall, or buildings. Widescreen This prompt is an example of the widescreen aspect ratio of 16:9. Prompt Generated output Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows. Portrait This prompt is an example of the portrait aspect ratio of 9:16. Prompt Generated output Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest. What's next Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/f7dd7a19-ca69-4cf9-87e6-dcfd2ab513c3 b/docstore/f7dd7a19-ca69-4cf9-87e6-dcfd2ab513c3 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f7dd7a19-ca69-4cf9-87e6-dcfd2ab513c3 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f7e84b18-cd4d-47de-a619-b85399fe4e70 b/docstore/f7e84b18-cd4d-47de-a619-b85399fe4e70 new file mode 100644 index 0000000000000000000000000000000000000000..1214a30652d58a2e32ff23cd8ceb69131e880032 --- /dev/null +++ b/docstore/f7e84b18-cd4d-47de-a619-b85399fe4e70 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#imagen-3 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f7f167a2-ac9e-4365-a4e2-f2d00ba49955 b/docstore/f7f167a2-ac9e-4365-a4e2-f2d00ba49955 new file mode 100644 index 0000000000000000000000000000000000000000..c6b31d98c2b16f02ca76b6157fd854c0b53727af --- /dev/null +++ b/docstore/f7f167a2-ac9e-4365-a4e2-f2d00ba49955 @@ -0,0 +1 @@ +: [ { parts : [ { text : 'How AI does work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" , config = types . 
GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, } ) \ No newline at end of file diff --git a/docstore/f7f5919e-d64e-4d8b-b7fc-8325841136e8 b/docstore/f7f5919e-d64e-4d8b-b7fc-8325841136e8 new file mode 100644 index 0000000000000000000000000000000000000000..719763e5cb8756394c0c2342dc2d56315a9c5b24 --- /dev/null +++ b/docstore/f7f5919e-d64e-4d8b-b7fc-8325841136e8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models#veo-2 Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f7fbb544-cc2b-467d-b542-ce83bf0124d6 b/docstore/f7fbb544-cc2b-467d-b542-ce83bf0124d6 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/f7fbb544-cc2b-467d-b542-ce83bf0124d6 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/f80d6b96-91b5-4d5b-8a19-3ecccbc66a8d b/docstore/f80d6b96-91b5-4d5b-8a19-3ecccbc66a8d new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/f80d6b96-91b5-4d5b-8a19-3ecccbc66a8d @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. 
An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/f811b783-98cb-44ae-a78e-f614e6c0e6e3 b/docstore/f811b783-98cb-44ae-a78e-f614e6c0e6e3 new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/f811b783-98cb-44ae-a78e-f614e6c0e6e3 @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. 
Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/f81df487-eee3-4598-a95c-765b96244358 b/docstore/f81df487-eee3-4598-a95c-765b96244358 new file mode 100644 index 0000000000000000000000000000000000000000..7c3d98af9909034f92832ab4dcc3a7220e5c9856 --- /dev/null +++ b/docstore/f81df487-eee3-4598-a95c-765b96244358 @@ -0,0 +1 @@ +operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , image : { imageBytes : response . generatedImages [ 0 ]. image . imageBytes , // response from Imagen mimeType : "image/png" , }, config : { aspectRatio : "16:9" , numberOfVideos : 2 , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` , // append your API key ); const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go image := response . GeneratedImages [ 0 ]. Image videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , NumberOfVideos : 2 , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "A dramatic scene based on the input image" , image , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_with_image_input_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } Veo model parameters (Naming conventions vary by programming language.) prompt : The text prompt for the video. When present, the image parameter is optional. image : The image to use as the first frame for the video. When present, the prompt parameter is optional. negativePrompt : Text string that describes anything you want to discourage the model from generating aspectRatio : Changes the aspect ratio of the generated video. \ No newline at end of file diff --git a/docstore/f8888e87-c723-4cb7-864e-ddaa7a23a9c4 b/docstore/f8888e87-c723-4cb7-864e-ddaa7a23a9c4 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/f8888e87-c723-4cb7-864e-ddaa7a23a9c4 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. 
Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use JavaScript/TypeScript SDK to do comopositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/f8cdba13-c114-419c-b63e-05cec7aa5fe1 b/docstore/f8cdba13-c114-419c-b63e-05cec7aa5fe1 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/f8cdba13-c114-419c-b63e-05cec7aa5fe1 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/f905843f-b894-4abf-a8f0-ec747d7fc3c3 b/docstore/f905843f-b894-4abf-a8f0-ec747d7fc3c3 new file mode 100644 index 0000000000000000000000000000000000000000..a8bbf59ddedb6c7213df1bd811fe6d2ce8316cfa --- /dev/null +++ b/docstore/f905843f-b894-4abf-a8f0-ec747d7fc3c3 @@ -0,0 +1 @@ +. live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event \ No newline at end of file diff --git a/docstore/f90f6321-a9eb-42c3-959b-1b7e6bee1778 b/docstore/f90f6321-a9eb-42c3-959b-1b7e6bee1778 new file mode 100644 index 0000000000000000000000000000000000000000..8ae6e7f8faf81825a858a0fb6935f6e2a3e8dc09 --- /dev/null +++ b/docstore/f90f6321-a9eb-42c3-959b-1b7e6bee1778 @@ -0,0 +1 @@ +help with that, as I'm only a language model." 
If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question - Is there randomness in generative models' responses, or are they deterministic? The short answer - yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn \ No newline at end of file diff --git a/docstore/f914fefb-bf28-4855-880b-792ec98fa206 b/docstore/f914fefb-bf28-4855-880b-792ec98fa206 new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/f914fefb-bf28-4855-880b-792ec98fa206 @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . 
name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/f932726b-4bef-4fe3-a048-2c4a5cd15dc8 b/docstore/f932726b-4bef-4fe3-a048-2c4a5cd15dc8 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f932726b-4bef-4fe3-a048-2c4a5cd15dc8 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. 
Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f93a1fe4-c2cd-4b39-92f3-c29c0bfd3a64 b/docstore/f93a1fe4-c2cd-4b39-92f3-c29c0bfd3a64 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/f93a1fe4-c2cd-4b39-92f3-c29c0bfd3a64 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/f9426498-4cda-4ef8-a27b-f3929c3a397b b/docstore/f9426498-4cda-4ef8-a27b-f3929c3a397b new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/f9426498-4cda-4ef8-a27b-f3929c3a397b @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . 
upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/f94ab70b-8aa0-47c9-9058-4aa7c9f1091a b/docstore/f94ab70b-8aa0-47c9-9058-4aa7c9f1091a new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/f94ab70b-8aa0-47c9-9058-4aa7c9f1091a @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . 
sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/f95e5ab6-ceec-4be4-89d6-b65b932520b7 b/docstore/f95e5ab6-ceec-4be4-89d6-b65b932520b7 new file mode 100644 index 0000000000000000000000000000000000000000..2b6e55e3ae415c04ff420e9e56413156ffa5e0fd --- /dev/null +++ b/docstore/f95e5ab6-ceec-4be4-89d6-b65b932520b7 @@ -0,0 +1 @@ +a sample rate of 24kHz. Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in \ No newline at end of file diff --git a/docstore/f9803079-4ad2-4ed4-8c22-496f5f240e45 b/docstore/f9803079-4ad2-4ed4-8c22-496f5f240e45 new file mode 100644 index 0000000000000000000000000000000000000000..3211ac16b1bc5cf788a258a14d07448add8397f9 --- /dev/null +++ b/docstore/f9803079-4ad2-4ed4-8c22-496f5f240e45 @@ -0,0 +1 @@ +{"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" Large PDFs stored locally Python from google import genai from google.genai import types import pathlib import httpx client = genai . Client () # Retrieve and encode the PDF byte file_path = pathlib . 
Path ( 'large_file.pdf' ) # Upload the PDF using the File API sample_file = client . files . upload ( file = file_path , ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_file , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const file = await ai . files . upload ({ file : 'path-to-localfile.pdf' , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" \ No newline at end of file diff --git a/docstore/f9961dce-a886-47c4-b804-8259164b4068 b/docstore/f9961dce-a886-47c4-b804-8259164b4068 new file mode 100644 index 0000000000000000000000000000000000000000..2551e2e0fc3bc2e016eedb8aba0387799f9e2616 --- /dev/null +++ b/docstore/f9961dce-a886-47c4-b804-8259164b4068 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/ephemeral-tokens#main-content Title: Ephemeral tokens | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f9a2d262-5f92-465f-befa-55e6d6bcf87c b/docstore/f9a2d262-5f92-465f-befa-55e6d6bcf87c new file mode 100644 index 0000000000000000000000000000000000000000..ed09bf86b4b3896290a2372bddef4006c085c60d --- /dev/null +++ b/docstore/f9a2d262-5f92-465f-befa-55e6d6bcf87c @@ -0,0 +1 @@ +Image generation | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image generation You can generate images using the Gemini API with either Gemini's built-in multimodal capabilities or Imagen, Google's specialized image generation models. For most use cases, start with Gemini . Choose Imagen for specialized tasks where image quality is critical. See Choosing the right model section for more guidance. All generated images include a SynthID watermark . Before you begin Ensure you use a supported model and version for image generation: For Gemini , use Gemini 2.0 Flash Preview Image Generation. For Imagen , use one of the Imagen models (Imagen 3, Imagen 4 or Imagen 4 Ultra). Note that those models are only available on the Paid tier . You can access both Gemini and Imagen models using the same libraries.
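Note: In the large-PDF example above, the Python snippet uploads the file but, unlike the JavaScript version, does not show the wait-for-processing loop. The sketch below adds that loop under stated assumptions: the state names (PROCESSING / FAILED) mirror the JavaScript example, the 5-second polling interval is arbitrary, and the state field is assumed to be an enum exposing .name.

import time
from google import genai

client = genai.Client()

# Upload a local PDF (hypothetical path, as in the example above).
myfile = client.files.upload(file="large_file.pdf")

# Poll until the service finishes processing the file, mirroring the
# JavaScript wait loop above. State names are an assumption.
while myfile.state.name == "PROCESSING":
    time.sleep(5)
    myfile = client.files.get(name=myfile.name)

if myfile.state.name == "FAILED":
    raise RuntimeError("File processing failed.")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, "Summarize this document"],
)
print(response.text)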
Note: Image generation may not be available in all regions and countries, review our Models page for more information. Generate images using Gemini Gemini can generate and process images conversationally. You can prompt Gemini with text, images, or a combination of both to achieve various image-related tasks, such as image generation and editing. You must include responseModalities : ["TEXT", "IMAGE"] in your configuration. Image-only output is not supported with these models. Image generation (text-to-image) The following code demonstrates how to generate an image based on a descriptive prompt: Python from google import genai from google.genai import types from PIL import Image from io import BytesIO import base64 client = genai . Client () contents = ( 'Hi, can you create a 3d rendered image of a pig ' 'with wings and a top hat flying \ No newline at end of file diff --git a/docstore/f9aa159f-797c-4ece-80e9-9bf057ba0b2e b/docstore/f9aa159f-797c-4ece-80e9-9bf057ba0b2e new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/f9aa159f-797c-4ece-80e9-9bf057ba0b2e @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
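Note: The model cards in this section list token limits and capability flags per variant. If you prefer to check the same metadata programmatically, a minimal sketch with the Python SDK follows; the exact field names on the returned model objects are an assumption based on the google-genai library.

from google import genai

client = genai.Client()

# List the models visible to this API key and print the metadata the
# cards above describe (names and token limits are assumed to be
# exposed as snake_case attributes).
for model in client.models.list():
    print(model.name, model.input_token_limit, model.output_token_limit)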
Try in \ No newline at end of file diff --git a/docstore/f9bfb0d3-ab86-48bf-b82e-6d46a5047732 b/docstore/f9bfb0d3-ab86-48bf-b82e-6d46a5047732 new file mode 100644 index 0000000000000000000000000000000000000000..70023e78d7b8c9459310371555beec5ba6328e28 --- /dev/null +++ b/docstore/f9bfb0d3-ab86-48bf-b82e-6d46a5047732 @@ -0,0 +1 @@ +results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and \ No newline at end of file diff --git a/docstore/f9c285fa-b4f3-4372-99fb-2a8c88cd77c9 b/docstore/f9c285fa-b4f3-4372-99fb-2a8c88cd77c9 new file mode 100644 index 0000000000000000000000000000000000000000..e2c6664b66e7e71d90be47f1cd332d4ab0155ce2 --- /dev/null +++ b/docstore/f9c285fa-b4f3-4372-99fb-2a8c88cd77c9 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#rate-limits Title: Gemini models | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/f9e0906b-d685-4ca7-80d1-b8bd9b193d52 b/docstore/f9e0906b-d685-4ca7-80d1-b8bd9b193d52 new file mode 100644 index 0000000000000000000000000000000000000000..73b4d10fa85a5cdaed2f1e90e6509f07166441b9 --- /dev/null +++ b/docstore/f9e0906b-d685-4ca7-80d1-b8bd9b193d52 @@ -0,0 +1 @@ +https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works in a few words" } ] } ] }' "Thinking" is on by default on many of our code samples Many code samples on this site use the Gemini 2.5 Flash model, which has the "thinking" feature enabled by default to enhance response quality. You should be aware that this may increase response time and token usage. If you prioritize speed or wish to minimize costs, you can disable this feature by setting the thinking budget to zero, as shown in the examples below. For more details, see the thinking guide . Note: Thinking is only available on Gemini 2.5 series models and can't be disabled on Gemini 2.5 Pro. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Explain how AI works in a few words" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) \ No newline at end of file diff --git a/docstore/f9e0b359-d88d-4650-a400-be8f87c0ebf6 b/docstore/f9e0b359-d88d-4650-a400-be8f87c0ebf6 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/f9e0b359-d88d-4650-a400-be8f87c0ebf6 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . 
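Note: The earlier sample disables thinking entirely with thinking_budget=0. As a complement, the sketch below keeps thinking enabled but caps the budget; the specific value is an arbitrary illustration, not a recommendation.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        # Cap thinking instead of disabling it; 512 is illustrative only.
        thinking_config=types.ThinkingConfig(thinking_budget=512)
    ),
)
print(response.text)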
If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/f9e46637-8ebe-4c9b-972e-2ff2bceed4f7 b/docstore/f9e46637-8ebe-4c9b-972e-2ff2bceed4f7 new file mode 100644 index 0000000000000000000000000000000000000000..6a8a77c3ec0b2f12317f225d20ed3ea5b03e9f67 --- /dev/null +++ b/docstore/f9e46637-8ebe-4c9b-972e-2ff2bceed4f7 @@ -0,0 +1 @@ +JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Write a story about a magic backpack." ; const result = await model . generateContentStream ( prompt ); // Print text as it comes in. for await ( const chunk of result . stream ) { const chunkText = chunk . text (); process . stdout . write ( chunkText ); } Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) iter := model . GenerateContentStream ( ctx , genai . Text ( "Write a story about a magic backpack." )) for { resp , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing the response } After Python from google import genai client = genai . Client () for chunk in client . models . generate_content_stream ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ): print ( chunk . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContentStream ({ model : "gemini-2.0-flash" , contents : "Write a story about a magic backpack." , }); let text = "" ; for await ( const chunk of response ) { console . log ( chunk . text ); text += chunk . text ; } Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } for result , err := range client . Models . GenerateContentStream ( ctx , "gemini-2.0-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) { if err != nil { log . Fatal ( err ) } fmt . Print ( result . Candidates [ 0 ]. Content . Parts [ 0 ]. 
Text ) } Configuration \ No newline at end of file diff --git a/docstore/f9f2b138-7213-4834-9932-cbf6a18a92e8 b/docstore/f9f2b138-7213-4834-9932-cbf6a18a92e8 new file mode 100644 index 0000000000000000000000000000000000000000..e0e57032aea14dffde2bf49bc8a0cf4aa3c0a406 --- /dev/null +++ b/docstore/f9f2b138-7213-4834-9932-cbf6a18a92e8 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/sdks Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/fa111193-0130-4452-8361-df89e5b93d8c b/docstore/fa111193-0130-4452-8361-df89e5b93d8c new file mode 100644 index 0000000000000000000000000000000000000000..846f589921f766089772715bc1a3853935a191ce --- /dev/null +++ b/docstore/fa111193-0130-4452-8361-df89e5b93d8c @@ -0,0 +1 @@ +batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's \ No newline at end of file diff --git a/docstore/fa162168-92bb-4661-9f79-45b0f487312e b/docstore/fa162168-92bb-4661-9f79-45b0f487312e new file mode 100644 index 0000000000000000000000000000000000000000..58a0a85d29a664bbdbdb8d3a93c58460ef8ef554 --- /dev/null +++ b/docstore/fa162168-92bb-4661-9f79-45b0f487312e @@ -0,0 +1 @@ += []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . 
push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: \ No newline at end of file diff --git a/docstore/fa199913-f780-4ff7-b255-758bfdacbd2f b/docstore/fa199913-f780-4ff7-b255-758bfdacbd2f new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/fa199913-f780-4ff7-b255-758bfdacbd2f @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/fa1a8536-d4cc-4489-83d1-bc760e6327c2 b/docstore/fa1a8536-d4cc-4489-83d1-bc760e6327c2 new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/fa1a8536-d4cc-4489-83d1-bc760e6327c2 @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/fa1d091b-2835-4f19-a602-ac6187c45071 b/docstore/fa1d091b-2835-4f19-a602-ac6187c45071 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/fa1d091b-2835-4f19-a602-ac6187c45071 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/fa5f3917-b5bf-42e8-9e30-7ac20e7310cb b/docstore/fa5f3917-b5bf-42e8-9e30-7ac20e7310cb new file mode 100644 index 0000000000000000000000000000000000000000..aef01da97801860cabcd3fb68af1ef57ccf11af0 --- /dev/null +++ b/docstore/fa5f3917-b5bf-42e8-9e30-7ac20e7310cb @@ -0,0 +1 @@ +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Speech generation (text-to-speech) The Gemini API can transform text input into single speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . 
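Note: As a quick preview of the single-speaker flow described in the next section, here is a minimal Python sketch. The model ID and voice name are assumptions chosen for illustration (check the Supported models and voice lists), and the output is assumed to be raw 24 kHz, 16-bit mono PCM as described elsewhere in these docs.

import wave
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",  # assumed preview TTS model ID
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")  # assumed voice
            )
        ),
    ),
)

# Wrap the returned PCM bytes in a WAV container.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)
    wf.setsampwidth(2)
    wf.setframerate(24000)
    wf.writeframes(pcm)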
Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a \ No newline at end of file diff --git a/docstore/fa638486-ec1e-42c2-87fe-58d70a83ffc5 b/docstore/fa638486-ec1e-42c2-87fe-58d70a83ffc5 new file mode 100644 index 0000000000000000000000000000000000000000..951343b7e154dfc2bc312960fc158259d984c283 --- /dev/null +++ b/docstore/fa638486-ec1e-42c2-87fe-58d70a83ffc5 @@ -0,0 +1 @@ +client . aio . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) Chat Start a chat and send a message to the model: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) chat = model . start_chat () response = chat . send_message ( "Tell me a story in 100 words" ) response = chat . send_message ( "What happened after that?" ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const chat = model . startChat ({ history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); let result = await chat . sendMessage ( "I have 2 dogs in my house." ); console . log ( result . response . text ()); result = await chat . sendMessage ( "How many paws are in my house?" ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) cs := model . StartChat () cs . History = [] * genai . Content { { Parts : [] genai . Part { genai . Text ( "Hello, I have 2 dogs in my house." ), }, Role : "user" , }, { Parts : [] genai . Part { genai . Text ( "Great to meet you. What would you like to know?" ), }, Role : "model" , }, } res , err := cs . SendMessage ( ctx , genai . Text ( "How many paws are in my house?" )) if err != nil { log . Fatal ( err ) } printResponse ( res ) // utility for printing the response After Python from google import genai client = genai . Client () chat = client . chats . create ( model = 'gemini-2.0-flash' ) response = chat . send_message ( message = 'Tell me a story in 100 words' ) response = \ No newline at end of file diff --git a/docstore/fa6b7a26-0c74-4ed2-9abd-50707f9e6814 b/docstore/fa6b7a26-0c74-4ed2-9abd-50707f9e6814 new file mode 100644 index 0000000000000000000000000000000000000000..0a87c36dd70c54ff062f2d96388cf5344fb768a1 --- /dev/null +++ b/docstore/fa6b7a26-0c74-4ed2-9abd-50707f9e6814 @@ -0,0 +1 @@ +process the response and check for Function Call, if Yes : Extract the name and args of the function and execute the corresponding function in your application. 
No: The model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create User friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full \ No newline at end of file diff --git a/docstore/fa8da9b9-b40b-4f6b-814a-4ec1fee6bb5d b/docstore/fa8da9b9-b40b-4f6b-814a-4ec1fee6bb5d new file mode 100644 index 0000000000000000000000000000000000000000..9b0480f81615786d4da1611ac72ab96b5af43602 --- /dev/null +++ b/docstore/fa8da9b9-b40b-4f6b-814a-4ec1fee6bb5d @@ -0,0 +1 @@ +"CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... 
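Note: To tie the set_light_values steps together, the sketch below sends the declaration from Step 1, reads back the function call the model requests, executes the local function, and returns a functionResponse part, reusing the Python SDK surface shown elsewhere on this page. The prompt text and the two-turn wiring are illustrative.

from google import genai
from google.genai import types

client = genai.Client()

# Reuse set_light_values_declaration and set_light_values from Step 1.
tools = types.Tool(function_declarations=[set_light_values_declaration])
config = types.GenerateContentConfig(tools=[tools])

contents = [
    types.Content(
        role="user",
        parts=[types.Part(text="Turn the lights down to a romantic level")],
    )
]
response = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)

# Execute the requested function locally.
function_call = response.candidates[0].content.parts[0].function_call
result = set_light_values(**function_call.args)

# Send the result back as a functionResponse part and get the final,
# user-facing answer.
function_response_part = types.Part.from_function_response(
    name=function_call.name,
    response={"result": result},
)
contents.append(response.candidates[0].content)  # the model turn with the call
contents.append(types.Content(role="user", parts=[function_response_part]))

final = client.models.generate_content(
    model="gemini-2.5-flash", contents=contents, config=config
)
print(final.text)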
Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" \ No newline at end of file diff --git a/docstore/fabf52b4-ff2b-43ac-afa3-c0601c1d8cca b/docstore/fabf52b4-ff2b-43ac-afa3-c0601c1d8cca new file mode 100644 index 0000000000000000000000000000000000000000..90fe65ac1e9a79ce0cb61f5f57b1f08b7b8d3cb5 --- /dev/null +++ b/docstore/fabf52b4-ff2b-43ac-afa3-c0601c1d8cca @@ -0,0 +1 @@ +state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported \ No newline at end of file diff --git a/docstore/fad377a6-8bc4-4ad6-852d-7672f0e822e7 b/docstore/fad377a6-8bc4-4ad6-852d-7672f0e822e7 new file mode 100644 index 0000000000000000000000000000000000000000..002d74f0081d5a1754ecf09d829f2e05938acd13 --- /dev/null +++ b/docstore/fad377a6-8bc4-4ad6-852d-7672f0e822e7 @@ -0,0 +1 @@ +turn_on_the_lights , turn_off_the_lights ] } ] const config = { responseModalities : [ Modality . TEXT ], tools : tools } // ... 
remaining model call What's next Check out more examples of using tools with the Live API in the Tool use cookbook . Get the full story on features and configurations from the Live API Capabilities guide . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/docstore/fad6e39f-3dd3-4d78-8e38-6a5b8dcea448 b/docstore/fad6e39f-3dd3-4d78-8e38-6a5b8dcea448 new file mode 100644 index 0000000000000000000000000000000000000000..2d7fb3592b6f7a6d7aadabbb231e86ce3c5bc44a --- /dev/null +++ b/docstore/fad6e39f-3dd3-4d78-8e38-6a5b8dcea448 @@ -0,0 +1 @@ +to some safety risks that can arise when using LLMs, and recommend emerging safety design and development recommendations. (Note that laws and regulations may also impose restrictions, but such considerations are beyond the scope of this guide.) The following steps are recommended when building applications with LLMs: Understanding the safety risks of your application Considering adjustments to mitigate safety risks Performing safety testing appropriate to your use case Soliciting feedback from users and monitoring usage The adjustment and testing phases should be iterative until you reach performance appropriate for your application. Understand the safety risks of your application In this context, safety is being defined as the ability of an LLM to avoid causing harm to its users, for example, by generating toxic language or content that promotes stereotypes. The models available through the Gemini API have been designed with Google’s AI principles in mind and your use of it is subject to the Generative AI Prohibited Use Policy . The API provides built-in safety filters to help address some common language model problems such as toxic language and hate speech, and striving for inclusiveness and avoidance of stereotypes. However, each application can pose a different set of risks to its users. So as the application owner, you are responsible for knowing your users and the potential harms your application may cause, and ensuring that your application uses LLMs safely and responsibly. As part of this assessment, you should consider the likelihood that harm could occur and determine its seriousness and mitigation steps. For example, an app that generates essays based on factual events would need to be more careful about avoiding misinformation, as compared to an app that generates fictional stories for entertainment. A good way to begin exploring potential safety risks is to research your end users, and others who might be affected by your application's results. This \ No newline at end of file diff --git a/docstore/faf7299b-7b82-46fd-9ee9-bc57ed84a126 b/docstore/faf7299b-7b82-46fd-9ee9-bc57ed84a126 new file mode 100644 index 0000000000000000000000000000000000000000..1b38806cd93f51329872622740eee5221d7a7d7e --- /dev/null +++ b/docstore/faf7299b-7b82-46fd-9ee9-bc57ed84a126 @@ -0,0 +1 @@ +18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
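Note: For reference, the setup that produces the output below looks roughly like this sketch: both Python callables are passed directly as tools so the SDK can orchestrate the calls automatically. The function bodies and prompt wording are stand-ins.

from google import genai
from google.genai import types

def get_weather_forecast(location: str) -> dict:
    """Gets the current weather temperature for a given location (mock)."""
    print(f"Tool Call: get_weather_forecast(location={location})")
    return {"temperature": 25, "unit": "celsius"}

def set_thermostat_temperature(temperature: int) -> dict:
    """Sets the thermostat to a desired temperature (mock)."""
    print(f"Tool Call: set_thermostat_temperature(temperature={temperature})")
    return {"status": "success"}

client = genai.Client()
config = types.GenerateContentConfig(
    tools=[get_weather_forecast, set_thermostat_temperature]
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="If it is warmer than 20°C in London, set the thermostat to 20°C, otherwise to 18°C.",
    config=config,
)
# Print the final, user-facing response
print(response.text)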
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ \ No newline at end of file diff --git a/docstore/fb0459cb-9355-46d2-a433-9972928e57a3 b/docstore/fb0459cb-9355-46d2-a433-9972928e57a3 new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/fb0459cb-9355-46d2-a433-9972928e57a3 @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/fb0608ea-63dd-4f42-9d4a-c98d04f94242 b/docstore/fb0608ea-63dd-4f42-9d4a-c98d04f94242 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/fb0608ea-63dd-4f42-9d4a-c98d04f94242 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/fb14d5b8-5251-4199-8154-1ec1b971d3b9 b/docstore/fb14d5b8-5251-4199-8154-1ec1b971d3b9 new file mode 100644 index 0000000000000000000000000000000000000000..42fbfa8d3a1b9c27b4f54909cff17ace224a9de6 --- /dev/null +++ b/docstore/fb14d5b8-5251-4199-8154-1ec1b971d3b9 @@ -0,0 +1 @@ +over a happy ' 'futuristic scifi city with lots of greenery?' ) response = client . models . generate_content ( model = "gemini-2.0-flash-preview-image-generation" , contents = contents , config = types . GenerateContentConfig ( response_modalities = [ 'TEXT' , 'IMAGE' ] ) ) for part in response . candidates [ 0 ] . content . parts : if part . text is not None : print ( part . text ) elif part . inline_data is not None : image = Image . open ( BytesIO (( part . inline_data . data ))) image . save ( 'gemini-native-image.png' ) image . show () JavaScript Note: We've released the Google SDK for TypeScript and JavaScript in preview launch stage . Use this SDK for image generation features. import { GoogleGenAI , Modality } from "@google/genai" ; import * as fs from "node:fs" ; async function main () { const ai = new GoogleGenAI ({}); const contents = "Hi, can you create a 3d rendered image of a pig " + "with wings and a top hat flying over a happy " + "futuristic scifi city with lots of greenery?" ; // Set responseModalities to include "Image" so the model can generate an image const response = await ai . models . generateContent ({ model : "gemini-2.0-flash-preview-image-generation" , contents : contents , config : { responseModalities : [ Modality . TEXT , Modality . IMAGE ], }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { // Based on the part type, either show the text or save the image if ( part . 
text ) { console . log ( part . text ); } else if ( part . inlineData ) { const imageData = part . inlineData . data ; const buffer = Buffer . from ( imageData , "base64" ); fs . writeFileSync ( "gemini-native-image.png" , buffer ); console . log ( "Image saved as gemini-native-image.png" ); } } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . \ No newline at end of file diff --git a/docstore/fb2725de-e096-4587-a588-6da594069d10 b/docstore/fb2725de-e096-4587-a588-6da594069d10 new file mode 100644 index 0000000000000000000000000000000000000000..b3339b694e68c4d7176324567b6e6d7542786980 --- /dev/null +++ b/docstore/fb2725de-e096-4587-a588-6da594069d10 @@ -0,0 +1 @@ +YouTube video per day. For the paid tier, there is no limit based on video length. For models before 2.5, you can upload only 1 video per request. For models after 2.5, you can upload a maximum of 10 videos per request. You can only upload public videos (not private or unlisted videos). The following example shows how to include a YouTube URL with a prompt: Python response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( file_data = types . FileData ( file_uri = 'https://www.youtube.com/watch?v=9hE5-98ZeCg' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . GOOGLE_API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-pro" }); const result = await model . generateContent ([ "Please summarize the video in 3 sentences." , { fileData : { fileUri : "https://www.youtube.com/watch?v=9hE5-98ZeCg" , }, }, ]); console . log ( result . response . text ()); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { genai . NewPartFromText ( "Please summarize the video in 3 sentences." ), genai . NewPartFromURI ( "https://www.youtube.com/watch?v=9hE5-98ZeCg" , "video/mp4" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Please summarize the video \ No newline at end of file diff --git a/docstore/fb372974-0316-43da-b4d0-147ce83ce73a b/docstore/fb372974-0316-43da-b4d0-147ce83ce73a new file mode 100644 index 0000000000000000000000000000000000000000..bbc4019685cdf16085ca79e5df30b3ebeb71657f --- /dev/null +++ b/docstore/fb372974-0316-43da-b4d0-147ce83ce73a @@ -0,0 +1 @@ +"role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . 
text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function \ No newline at end of file diff --git a/docstore/fb84a821-4f29-4179-a379-2dec93298a7f b/docstore/fb84a821-4f29-4179-a379-2dec93298a7f new file mode 100644 index 0000000000000000000000000000000000000000..f65bfb5d195a3160683160d98bf38afd321eba5f --- /dev/null +++ b/docstore/fb84a821-4f29-4179-a379-2dec93298a7f @@ -0,0 +1 @@ +Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/fbb08b81-9c3c-454e-9e1e-3eb043c7df19 b/docstore/fbb08b81-9c3c-454e-9e1e-3eb043c7df19 new file mode 100644 index 0000000000000000000000000000000000000000..665a477ea8352b1598262b3124a473a18fa8289a --- /dev/null +++ b/docstore/fbb08b81-9c3c-454e-9e1e-3eb043c7df19 @@ -0,0 +1 @@ +professional, detailed The following are a few examples of prompts without quality modifiers and the same prompt with quality modifiers. Prompt (no quality modifiers): a photo of a corn stalk Prompt (with quality modifiers): 4k HDR beautiful photo of a corn stalk taken by a professional photographer Image source: Each image was generated using its corresponding text prompt with the Imagen 3 model. Aspect ratios Imagen image generation lets you set five distinct image aspect ratios. Square (1:1, default) - A standard square photo. Common uses for this aspect ratio include social media posts. Fullscreen (4:3) - This aspect ratio is commonly used in media or film. It is also the dimensions of most old (non-widescreen) TVs and medium format cameras. It captures more of the scene horizontally (compared to 1:1), making it a preferred aspect ratio for photography. Prompt: close up of a musician's fingers playing the piano, black and white film, vintage (4:3 aspect ratio) Prompt: A professional studio photo of french fries for a high end restaurant, in the style of a food magazine (4:3 aspect ratio) Portrait full screen (3:4) - This is the fullscreen aspect ratio rotated 90 degrees. 
This lets you capture more of the scene vertically compared to the 1:1 aspect ratio. Prompt: a woman hiking, close of her boots reflected in a puddle, large mountains in the background, in the style of an advertisement, dramatic angles (3:4 aspect ratio) Prompt: aerial shot of a river flowing up a mystical valley (3:4 aspect ratio) Widescreen (16:9) - This ratio has replaced 4:3 and is now the most common aspect ratio for TVs, monitors, and mobile phone screens (landscape). Use this aspect ratio when you want to capture more of the background (for example, scenic landscapes). Prompt: a man wearing all white clothing sitting on the beach, close up, golden hour lighting (16:9 aspect ratio) Portrait (9:16) - This ratio is widescreen but rotated. This is a relatively new aspect ratio that has been \ No newline at end of file diff --git a/docstore/fbe6e0b1-35c7-4e4e-8fac-8c98eb5bd294 b/docstore/fbe6e0b1-35c7-4e4e-8fac-8c98eb5bd294 new file mode 100644 index 0000000000000000000000000000000000000000..a97f8e9b4e6ce1d85f8c4b9bed31a9676bb7c602 --- /dev/null +++ b/docstore/fbe6e0b1-35c7-4e4e-8fac-8c98eb5bd294 @@ -0,0 +1 @@ +Files API | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Files API The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models or explained as having multimodal capabilities . This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile .
mimeType ), "Describe this audio \ No newline at end of file diff --git a/docstore/fbee04ac-6a06-42f3-b4a1-a5553ffec271 b/docstore/fbee04ac-6a06-42f3-b4a1-a5553ffec271 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/fbee04ac-6a06-42f3-b4a1-a5553ffec271 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in \ No newline at end of file diff --git a/docstore/fbf15e40-0b9a-43ad-a489-76b997820272 b/docstore/fbf15e40-0b9a-43ad-a489-76b997820272 new file mode 100644 index 0000000000000000000000000000000000000000..af04e93a60e88c829b3aa07a575f51b8b1fa9ccf --- /dev/null +++ b/docstore/fbf15e40-0b9a-43ad-a489-76b997820272 @@ -0,0 +1 @@ +TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. 
Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token \ No newline at end of file diff --git a/docstore/fc166d2c-e978-4f51-86ef-2bfc22aba695 b/docstore/fc166d2c-e978-4f51-86ef-2bfc22aba695 new file mode 100644 index 0000000000000000000000000000000000000000..2dd19cc1c2d77863f4a301d62456871308dd74fd --- /dev/null +++ b/docstore/fc166d2c-e978-4f51-86ef-2bfc22aba695 @@ -0,0 +1 @@ +"temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. 
Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } \ No newline at end of file diff --git a/docstore/fc1d5d3e-e69d-4544-8468-d9c02c7a2c91 b/docstore/fc1d5d3e-e69d-4544-8468-d9c02c7a2c91 new file mode 100644 index 0000000000000000000000000000000000000000..398c27f81f98fb70fd75971ce8544e21d251500f --- /dev/null +++ b/docstore/fc1d5d3e-e69d-4544-8468-d9c02c7a2c91 @@ -0,0 +1 @@ +Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. \ No newline at end of file diff --git a/docstore/fc24150a-6c1b-4f5f-a0a0-15f9e02a245d b/docstore/fc24150a-6c1b-4f5f-a0a0-15f9e02a245d new file mode 100644 index 0000000000000000000000000000000000000000..5b31a2c588785b0dc19769f45b0589a09f2843d3 --- /dev/null +++ b/docstore/fc24150a-6c1b-4f5f-a0a0-15f9e02a245d @@ -0,0 +1 @@ +world knowledge and reasoning. Seamlessly blending text and images is important. You want accurate visuals embedded within long text sequences. You want to edit images conversationally while maintaining context. Choose Imagen when: Image quality, photorealism, artistic detail, or specific styles (e.g., impressionism, anime) are top priorities. Performing specialized editing tasks like product background updates or image upscaling. Infusing branding, style, or generating logos and product designs. Imagen 4 should be your go-to model starting to generate images with Imagen. Choose Imagen 4 Ultra for advanced use-cases or when you need the best image quality. Note that Imagen 4 Ultra can only generate one image at a time. 
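The Imagen guidance above can be exercised with a short, hedged sketch using the google-genai Python SDK. The model id (imagen-3.0-generate-002), prompt, and output filenames below are illustrative placeholders rather than values taken from this page, so substitute the Imagen 4 model id available to your project; the aspect_ratio and number_of_images settings reflect the constraints described above.

# Minimal sketch, assuming a placeholder Imagen model id and prompt.
from io import BytesIO

from google import genai
from google.genai import types
from PIL import Image

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",  # placeholder; use the Imagen 4 id available to your project
    prompt="A sketch of a modern apartment building surrounded by skyscrapers",
    config=types.GenerateImagesConfig(
        number_of_images=1,   # Imagen 4 Ultra can only generate one image at a time
        aspect_ratio="4:3",   # one of the supported ratios: "1:1", "4:3", "3:4", "16:9", "9:16"
    ),
)

for n, generated_image in enumerate(response.generated_images):
    # Each result carries raw image bytes; decode and save them with Pillow.
    image = Image.open(BytesIO(generated_image.image.image_bytes))
    image.save(f"imagen-example-{n}.png")

Pillow is used here only to decode the returned bytes; the prompt itself follows the subject, context, and style structure described in the prompt guide below.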
Imagen prompt guide This section of the Imagen guide shows you how modifying a text-to-image prompt can produce different results, along with examples of images you can create. Prompt writing basics Note: Maximum prompt length is 480 tokens. A good prompt is descriptive and clear, and makes use of meaningful keywords and modifiers. Start by thinking of your subject , context , and style . Image text: A sketch ( style ) of a modern apartment building ( subject ) surrounded by skyscrapers ( context and background ). Subject : The first thing to think about with any prompt is the subject : the object, person, animal, or scenery you want an image of. Context and background: Just as important is the background or context in which the subject will be placed. Try placing your subject in a variety of backgrounds. For example, a studio with a white background, outdoors, or indoor environments. Style: Finally, add the style of image you want. Styles can be general (painting, photograph, sketches) or very specific (pastel painting, charcoal drawing, isometric 3D). You can also combine styles. After you write a first version of your prompt, refine your prompt by adding more details until you get to the image that you want. Iteration is important. Start by \ No newline at end of file diff --git a/docstore/fc366b39-2223-425f-a325-e7234caefed4 b/docstore/fc366b39-2223-425f-a325-e7234caefed4 new file mode 100644 index 0000000000000000000000000000000000000000..f039b5ac5d90852c6e127ae22e04141bbc717563 --- /dev/null +++ b/docstore/fc366b39-2223-425f-a325-e7234caefed4 @@ -0,0 +1 @@ +patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. 
Preview: gemini-2.5-flash-preview-05-20 \ No newline at end of file diff --git a/docstore/fc4472e9-53fd-4c92-9e1f-6b2e541c9e77 b/docstore/fc4472e9-53fd-4c92-9e1f-6b2e541c9e77 new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/fc4472e9-53fd-4c92-9e1f-6b2e541c9e77 @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/fc7087e1-d58a-4977-ad78-c05ee95f15bc b/docstore/fc7087e1-d58a-4977-ad78-c05ee95f15bc new file mode 100644 index 0000000000000000000000000000000000000000..8450328bfc49bb62b82cf9cb2ac9ca979a8f4eef --- /dev/null +++ b/docstore/fc7087e1-d58a-4977-ad78-c05ee95f15bc @@ -0,0 +1 @@ +Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! 
done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : \ No newline at end of file diff --git a/docstore/fc8ee811-13cb-49d3-ab9f-adebe4504111 b/docstore/fc8ee811-13cb-49d3-ab9f-adebe4504111 new file mode 100644 index 0000000000000000000000000000000000000000..a1f30e86f8de69da772a1b833567cf406e31d0a4 --- /dev/null +++ b/docstore/fc8ee811-13cb-49d3-ab9f-adebe4504111 @@ -0,0 +1 @@ +temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means Live API can handle the function calling similar to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , tools = tools , modality = "AUDIO" ) Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the. function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. 
If allowed_function_names is \ No newline at end of file diff --git a/docstore/fc95b1d5-5d29-4334-8559-363b95d02c51 b/docstore/fc95b1d5-5d29-4334-8559-363b95d02c51 new file mode 100644 index 0000000000000000000000000000000000000000..ba9fb868f99e81b20779165b803150afffeabaec --- /dev/null +++ b/docstore/fc95b1d5-5d29-4334-8559-363b95d02c51 @@ -0,0 +1 @@ +Generate video using Veo | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Generate video using Veo The Gemini API provides access to Veo 2 , Google's most capable video generation model to date. Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section. Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later Generate videos This section provides code examples for generating videos using text prompts and using images . Generate from text You can use the following code to generate videos with Veo: Python import time from google import genai from google.genai import types client = genai . Client () operation = client . models . generate_videos ( model = "veo-2.0-generate-001" , prompt = "Panning wide shot of a calico kitten sleeping in the sunshine" , config = types . GenerateVideosConfig ( person_generation = "dont_allow" , # "dont_allow" or "allow_adult" aspect_ratio = "16:9" , # "16:9" or "9:16" ), ) while not operation . done : time . sleep ( 20 ) operation = client . operations . get ( operation ) for n , generated_video in enumerate ( \ No newline at end of file diff --git a/docstore/fca1d256-398b-4876-a757-4dfe711da8a5 b/docstore/fca1d256-398b-4876-a757-4dfe711da8a5 new file mode 100644 index 0000000000000000000000000000000000000000..b58c189db2b124c61872134bdd4b3d786a4e18e6 --- /dev/null +++ b/docstore/fca1d256-398b-4876-a757-4dfe711da8a5 @@ -0,0 +1 @@ +you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" 
) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed \ No newline at end of file diff --git a/docstore/fcab7964-2336-4b6e-b974-8ff78a759f01 b/docstore/fcab7964-2336-4b6e-b974-8ff78a759f01 new file mode 100644 index 0000000000000000000000000000000000000000..1b590529498df147a73d1ae4a22f439d08a36cb6 --- /dev/null +++ b/docstore/fcab7964-2336-4b6e-b974-8ff78a759f01 @@ -0,0 +1 @@ +from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; \ No newline at end of file diff --git a/docstore/fcd9f62b-d855-4280-bc60-fb52210178e8 b/docstore/fcd9f62b-d855-4280-bc60-fb52210178e8 new file mode 100644 index 0000000000000000000000000000000000000000..f71ac6c85727e3c520290c703b52e420cb1baa33 --- /dev/null +++ b/docstore/fcd9f62b-d855-4280-bc60-fb52210178e8 @@ -0,0 +1 @@ +(JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . name } \ No newline at end of file diff --git a/docstore/fcf6cfdb-6c26-45b0-b6f9-7607799847e2 b/docstore/fcf6cfdb-6c26-45b0-b6f9-7607799847e2 new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/fcf6cfdb-6c26-45b0-b6f9-7607799847e2 @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. 
To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/fd0017cd-7ef4-4c3c-adb7-0c6f3211556f b/docstore/fd0017cd-7ef4-4c3c-adb7-0c6f3211556f new file mode 100644 index 0000000000000000000000000000000000000000..0abe7c770a1c93708b98ee8b0a34df5d347d5c9d --- /dev/null +++ b/docstore/fd0017cd-7ef4-4c3c-adb7-0c6f3211556f @@ -0,0 +1 @@ +candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... 
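As a complement to the JavaScript above, the following is a hedged, self-contained Python sketch of the same round trip. The getWeather declaration and the stubbed weather result are illustrative (only the function name and response string echo the example request above); the key point is that the model's own content, including any thoughtSignature parts, is appended back unchanged before the function response.

from google import genai
from google.genai import types

client = genai.Client()

# Illustrative declaration; any real tool schema would be returned the same way.
get_weather = types.FunctionDeclaration(
    name="getWeather",
    description="Gets the current weather for a city.",
    parameters=types.Schema(
        type=types.Type.OBJECT,
        properties={"city": types.Schema(type=types.Type.STRING)},
        required=["city"],
    ),
)
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[get_weather])],
)

contents = [
    types.Content(role="user", parts=[types.Part(text="what is the weather in Lake Tahoe?")]),
]

# First call: with this tool available, the model typically answers with a functionCall
# part that may carry a thought signature.
response = client.models.generate_content(
    model="gemini-2.5-flash", config=config, contents=contents
)

# Return the model's content untouched so any signature parts keep their original position.
contents.append(response.candidates[0].content)

# Append the (stubbed) function result as a functionResponse part and call the model again.
contents.append(
    types.Content(
        role="user",
        parts=[
            types.Part.from_function_response(
                name="getWeather",
                response={"response": "Sunny and hot. 90 degrees Fahrenheit"},
            )
        ],
    )
)

final_response = client.models.generate_content(
    model="gemini-2.5-flash", config=config, contents=contents
)
print(final_response.text)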
Learn more about limitations and usage of thought signatures, and about \ No newline at end of file diff --git a/docstore/fd14c72f-dafa-435b-8dbb-ceb7c78de684 b/docstore/fd14c72f-dafa-435b-8dbb-ceb7c78de684 new file mode 100644 index 0000000000000000000000000000000000000000..45d17d6ac3f5b7951c085e76c40e76fbe5fe62ea --- /dev/null +++ b/docstore/fd14c72f-dafa-435b-8dbb-ceb7c78de684 @@ -0,0 +1 @@ +"thinkingConfig": { "thinkingBudget": 1024 # Thinking off: # "thinkingBudget": 0 # Turn on dynamic thinking: # "thinkingBudget": -1 } } }' Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response parameter's parts , and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" \ No newline at end of file diff --git a/docstore/fd180298-fa4c-4b12-96fb-cf356186d04d b/docstore/fd180298-fa4c-4b12-96fb-cf356186d04d new file mode 100644 index 0000000000000000000000000000000000000000..37fa730aa9280f3cac34df0c8f8ecdd2b308e691 --- /dev/null +++ b/docstore/fd180298-fa4c-4b12-96fb-cf356186d04d @@ -0,0 +1 @@ +operation . response . generated_videos ): client . files . download ( file = generated_video . video ) generated_video . video . save ( f "video { n } .mp4" ) # save the video JavaScript import { GoogleGenAI } from "@google/genai" ; import { createWriteStream } from "fs" ; import { Readable } from "stream" ; const ai = new GoogleGenAI ({}); async function main () { let operation = await ai . models . generateVideos ({ model : "veo-2.0-generate-001" , prompt : "Panning wide shot of a calico kitten sleeping in the sunshine" , config : { personGeneration : "dont_allow" , aspectRatio : "16:9" , }, }); while ( ! operation . done ) { await new Promise (( resolve ) = > setTimeout ( resolve , 10000 )); operation = await ai . operations . getVideosOperation ({ operation : operation , }); } operation . response ? . 
generatedVideos ? . forEach ( async ( generatedVideo , n ) = > { const resp = await fetch ( ` ${ generatedVideo . video ? . uri } & key=GEMINI_API_KEY` ); // append your API key const writer = createWriteStream ( `video ${ n } .mp4` ); Readable . fromWeb ( resp . body ). pipe ( writer ); }); } main (); Go package main import ( "context" "fmt" "os" "time" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } videoConfig := & genai . GenerateVideosConfig { AspectRatio : "16:9" , PersonGeneration : "dont_allow" , } operation , _ := client . Models . GenerateVideos ( ctx , "veo-2.0-generate-001" , "Panning wide shot of a calico kitten sleeping in the sunshine" , nil , videoConfig , ) for ! operation . Done { time . Sleep ( 20 * time . Second ) operation , _ = client . Operations . GetVideosOperation ( ctx , operation , nil ) } for n , video := range operation . Response . GeneratedVideos { client . Files . Download ( ctx , video . Video , nil ) fname := fmt . Sprintf ( "video_%d.mp4" , n ) _ = os . WriteFile ( fname , video . Video . VideoBytes , 0644 ) } } REST # \ No newline at end of file diff --git a/docstore/fd3184f7-ab25-488c-b3e7-bd8eca044559 b/docstore/fd3184f7-ab25-488c-b3e7-bd8eca044559 new file mode 100644 index 0000000000000000000000000000000000000000..0d39ec227c9e73c3d3b5117917b80f593b500b4b --- /dev/null +++ b/docstore/fd3184f7-ab25-488c-b3e7-bd8eca044559 @@ -0,0 +1 @@ +async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ \ No newline at end of file diff --git a/docstore/fd3fe187-779a-457c-a0bd-13919d52cae9 b/docstore/fd3fe187-779a-457c-a0bd-13919d52cae9 new file mode 100644 index 0000000000000000000000000000000000000000..1540f812946831d7975c774be202eaa83cf52504 --- /dev/null +++ b/docstore/fd3fe187-779a-457c-a0bd-13919d52cae9 @@ -0,0 +1 @@ +"CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. 
But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought \ No newline at end of file diff --git a/docstore/fd414c3d-62c1-4bd8-9a57-7b4e6aec5bc0 b/docstore/fd414c3d-62c1-4bd8-9a57-7b4e6aec5bc0 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/fd414c3d-62c1-4bd8-9a57-7b4e6aec5bc0 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/fd46000c-2730-4061-8164-2f29848db85e b/docstore/fd46000c-2730-4061-8164-2f29848db85e new file mode 100644 index 0000000000000000000000000000000000000000..86d48414067c1d66b488d48692d58e08ff4ae97d --- /dev/null +++ b/docstore/fd46000c-2730-4061-8164-2f29848db85e @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries#main-content Title: Gemini API libraries | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/fd529830-5a1f-4a75-af41-069d98e3cd6f b/docstore/fd529830-5a1f-4a75-af41-069d98e3cd6f new file mode 100644 index 0000000000000000000000000000000000000000..1644886f172ebbc1a531a5a1c6804985c1bad374 --- /dev/null +++ b/docstore/fd529830-5a1f-4a75-af41-069d98e3cd6f @@ -0,0 +1 @@ +calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. 
We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation \ No newline at end of file diff --git a/docstore/fd5a18d1-740f-407c-ac6c-d64b4e8796cb b/docstore/fd5a18d1-740f-407c-ac6c-d64b4e8796cb new file mode 100644 index 0000000000000000000000000000000000000000..7c053666ab07aea3818716f4f450806e9d4b0b54 --- /dev/null +++ b/docstore/fd5a18d1-740f-407c-ac6c-d64b4e8796cb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#compositional_function_calling Title: Function calling with the Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/fd5beaed-72e2-4103-9158-e2aeabea47a1 b/docstore/fd5beaed-72e2-4103-9158-e2aeabea47a1 new file mode 100644 index 0000000000000000000000000000000000000000..dcef3957feeb00af8db96f54c364a1fcfa6f1d5f --- /dev/null +++ b/docstore/fd5beaed-72e2-4103-9158-e2aeabea47a1 @@ -0,0 +1 @@ +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, \ No newline at end of file diff --git a/docstore/fd79c51a-6564-4997-a03a-d9de7c2bfc09 b/docstore/fd79c51a-6564-4997-a03a-d9de7c2bfc09 new file mode 100644 index 0000000000000000000000000000000000000000..a5c7df71403cc48e8e56352e55ad417999aabca3 --- /dev/null +++ b/docstore/fd79c51a-6564-4997-a03a-d9de7c2bfc09 @@ -0,0 +1 @@ +. getGenerativeModel ({ model : "gemini-1.5-flash" , safetySettings : [ { category : HarmCategory . HARM_CATEGORY_HARASSMENT , threshold : HarmBlockThreshold . 
BLOCK_LOW_AND_ABOVE , }, ], }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const result = await model . generateContent ( unsafePrompt ); try { result . response . text (); } catch ( e ) { console . error ( e ); console . log ( result . response . candidates [ 0 ]. safetyRatings ); } After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'say something bad' , config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = 'HARM_CATEGORY_HATE_SPEECH' , threshold = 'BLOCK_ONLY_HIGH' ), ] ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : unsafePrompt , config : { safetySettings : [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_ONLY_HIGH" , }, ], }, }); console . log ( "Finish reason:" , response . candidates [ 0 ]. finishReason ); console . log ( "Safety ratings:" , response . candidates [ 0 ]. safetyRatings ); Async Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content_async ( 'tell me a story in 100 words' ) After Python To use the new SDK with asyncio , there is a separate async implementation of every method under client.aio . from google import genai client = genai . Client () response = await \ No newline at end of file diff --git a/docstore/fd9ae530-b828-4879-9d98-d8d5e54d418a b/docstore/fd9ae530-b828-4879-9d98-d8d5e54d418a new file mode 100644 index 0000000000000000000000000000000000000000..71a40ddf8f5869779c79b3a864f86f411dc17d60 --- /dev/null +++ b/docstore/fd9ae530-b828-4879-9d98-d8d5e54d418a @@ -0,0 +1 @@ +correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . 
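The migration notes above mention that the new SDK exposes an async variant of every method under client.aio, but the snippet in the text is cut off. A minimal sketch of what a complete async call could look like, based on that description and on the synchronous calls shown elsewhere on these pages (treat the exact method path as an assumption):

```python
import asyncio

from google import genai


async def main() -> None:
    client = genai.Client()  # reads GEMINI_API_KEY from the environment
    # Async counterpart of client.models.generate_content, per the client.aio note above.
    response = await client.aio.models.generate_content(
        model="gemini-2.0-flash",
        contents="Tell me a story in 100 words.",
    )
    print(response.text)


if __name__ == "__main__":
    asyncio.run(main())
```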
For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/docstore/fd9c0e5e-886d-4866-8076-3131a529a706 b/docstore/fd9c0e5e-886d-4866-8076-3131a529a706 new file mode 100644 index 0000000000000000000000000000000000000000..766dc5f5f1448365b15d5384874570503ea31fdc --- /dev/null +++ b/docstore/fd9c0e5e-886d-4866-8076-3131a529a706 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation#text-input Title: Text generation | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/fd9cddeb-6a88-41ec-b3ab-828926dc19b3 b/docstore/fd9cddeb-6a88-41ec-b3ab-828926dc19b3 new file mode 100644 index 0000000000000000000000000000000000000000..49e7abc34670e44391bcc66ea2ddb4f1c7cb4909 --- /dev/null +++ b/docstore/fd9cddeb-6a88-41ec-b3ab-828926dc19b3 @@ -0,0 +1 @@ +calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. 
Try in \ No newline at end of file diff --git a/docstore/fda48def-a278-4639-b03a-1df69bc8d32f b/docstore/fda48def-a278-4639-b03a-1df69bc8d32f new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/fda48def-a278-4639-b03a-1df69bc8d32f @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/fdadae30-1dbc-4013-a654-68a680a8781b b/docstore/fdadae30-1dbc-4013-a654-68a680a8781b new file mode 100644 index 0000000000000000000000000000000000000000..3f35d7c2ee0452cbbcb055812399e279fb8f7031 --- /dev/null +++ b/docstore/fdadae30-1dbc-4013-a654-68a680a8781b @@ -0,0 +1 @@ +$GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. 
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model \ No newline at end of file diff --git a/docstore/fdb282bf-d3f4-421d-a827-ffcd86ba646d b/docstore/fdb282bf-d3f4-421d-a827-ffcd86ba646d new file mode 100644 index 0000000000000000000000000000000000000000..1983a1b7b4b0634f95c028654d1fae0a75b50e6a --- /dev/null +++ b/docstore/fdb282bf-d3f4-421d-a827-ffcd86ba646d @@ -0,0 +1 @@ +{ mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations, handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. 
""" music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The \ No newline at end of file diff --git a/docstore/fde9dfd9-4fdd-4408-93a0-485544ac5055 b/docstore/fde9dfd9-4fdd-4408-93a0-485544ac5055 new file mode 100644 index 0000000000000000000000000000000000000000..39af00a0c9a9d748f5b17ca6db1ebff5b823541e --- /dev/null +++ b/docstore/fde9dfd9-4fdd-4408-93a0-485544ac5055 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This \ No newline at end of file diff --git a/docstore/fdf6bf0e-cc12-418c-a863-013b32297e81 b/docstore/fdf6bf0e-cc12-418c-a863-013b32297e81 new file mode 100644 index 0000000000000000000000000000000000000000..2744c4adf6a3757935f9239a6f4017c796d018fb --- /dev/null +++ b/docstore/fdf6bf0e-cc12-418c-a863-013b32297e81 @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-guidance Title: Safety guidance | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/fe04055d-b21f-4095-adce-060aad9c7535 b/docstore/fe04055d-b21f-4095-adce-060aad9c7535 new file mode 100644 index 0000000000000000000000000000000000000000..5a2b8a2c7253b87796a50aae4616794fa40b7018 --- /dev/null +++ b/docstore/fe04055d-b21f-4095-adce-060aad9c7535 @@ -0,0 +1 @@ +( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . 
message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . \ No newline at end of file diff --git a/docstore/fe139c75-0fab-4e15-97dd-6867476261fa b/docstore/fe139c75-0fab-4e15-97dd-6867476261fa new file mode 100644 index 0000000000000000000000000000000000000000..8466b571c67a82ae45981f82366ef1fa006665ef --- /dev/null +++ b/docstore/fe139c75-0fab-4e15-97dd-6867476261fa @@ -0,0 +1 @@ +gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. 
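The Gemini 2.0 Flash Preview Image Generation model just introduced returns interleaved text and image parts. A rough sketch of how it might be called with the Python client used elsewhere on these pages; the response_modalities setting and the part handling follow the image generation guide, so treat those details as assumptions rather than a definitive implementation:

```python
from google import genai
from google.genai import types

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Create a picture of a paper boat sailing across a puddle.",
    # Assumption: this preview model expects both TEXT and IMAGE response modalities.
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        # inline_data.data holds the raw image bytes (PNG) in the Python SDK.
        with open("boat.png", "wb") as f:
            f.write(part.inline_data.data)
```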
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa \ No newline at end of file diff --git a/docstore/fe4df35e-0a04-48fd-b247-4aa563467414 b/docstore/fe4df35e-0a04-48fd-b247-4aa563467414 new file mode 100644 index 0000000000000000000000000000000000000000..6e71e94222e9c44768c28e09ebada72b5ff1e76f --- /dev/null +++ b/docstore/fe4df35e-0a04-48fd-b247-4aa563467414 @@ -0,0 +1 @@ +writeFileSync ( `imagen- ${ idx } .png` , buffer ); idx ++ ; } } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateImagesConfig { NumberOfImages : 4 , } response , _ := client . Models . GenerateImages ( ctx , "imagen-4.0-generate-preview-06-06" , "Robot holding a red skateboard" , config , ) for n , image := range response . GeneratedImages { fname := fmt . Sprintf ( "imagen-%d.png" , n ) _ = os . WriteFile ( fname , image . Image . ImageBytes , 0644 ) } } REST curl -X POST \ "https://generativelanguage.googleapis.com/v1beta/models/imagen-4.0-generate-preview-06-06:predict" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type: application/json" \ -d '{ "instances": [ { "prompt": "Robot holding a red skateboard" } ], "parameters": { "sampleCount": 4 } }' AI-generated image of a robot holding a red skateboard Imagen configuration Imagen supports English only prompts at this time and the following parameters: Note: Naming conventions of parameters vary by programming language. numberOfImages : The number of images to generate, from 1 to 4 (inclusive). The default is 4. For Imagen 4 Ultra, it defaults to 1 as only one image can be generated at a time. aspectRatio : Changes the aspect ratio of the generated image. Supported values are "1:1" , "3:4" , "4:3" , "9:16" , and "16:9" . The default is "1:1" . personGeneration : Allow the model to generate images of people. The following values are supported: "dont_allow" : Block generation of images of people. "allow_adult" : Generate images of adults, but not children. This is the default. "allow_all" : Generate images that include adults and children. Note: The "allow_all" parameter value is not allowed in EU, UK, CH, MENA locations. Choosing the right model Choose Gemini when: You need contextually relevant images that leverage \ No newline at end of file diff --git a/docstore/fe5a5118-4334-4879-bddb-070d33032e12 b/docstore/fe5a5118-4334-4879-bddb-070d33032e12 new file mode 100644 index 0000000000000000000000000000000000000000..0d6f2bf32b918d3f85636fdb1c53070e3d5509f6 --- /dev/null +++ b/docstore/fe5a5118-4334-4879-bddb-070d33032e12 @@ -0,0 +1 @@ +overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . 
join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each \ No newline at end of file diff --git a/docstore/fe6e5ef1-6b4c-4b5b-a5a2-87741054813a b/docstore/fe6e5ef1-6b4c-4b5b-a5a2-87741054813a new file mode 100644 index 0000000000000000000000000000000000000000..750217269b5a3e53dd2d92de822b68418a6799df --- /dev/null +++ b/docstore/fe6e5ef1-6b4c-4b5b-a5a2-87741054813a @@ -0,0 +1 @@ +' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . 
Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl \ No newline at end of file diff --git a/docstore/fe7d4d6a-e567-4920-b889-a5fef0c94f33 b/docstore/fe7d4d6a-e567-4920-b889-a5fef0c94f33 new file mode 100644 index 0000000000000000000000000000000000000000..5db8bfd80021d23905dc72cb076a591abebf4c74 --- /dev/null +++ b/docstore/fe7d4d6a-e567-4920-b889-a5fef0c94f33 @@ -0,0 +1 @@ +Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . 
\ No newline at end of file diff --git a/docstore/fe8593f4-ac91-4150-9c86-c4ca2b22a623 b/docstore/fe8593f4-ac91-4150-9c86-c4ca2b22a623 new file mode 100644 index 0000000000000000000000000000000000000000..b56dfbca2298dddb94eb66ba2a6b5e2a4d7109fa --- /dev/null +++ b/docstore/fe8593f4-ac91-4150-9c86-c4ca2b22a623 @@ -0,0 +1 @@ +"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom \ No newline at end of file diff --git a/docstore/fe953e48-478b-47a7-8841-36c61b5d10b8 b/docstore/fe953e48-478b-47a7-8841-36c61b5d10b8 new file mode 100644 index 0000000000000000000000000000000000000000..facd4f50718f089582ed06a7b9e2ea144ce56d9c --- /dev/null +++ b/docstore/fe953e48-478b-47a7-8841-36c61b5d10b8 @@ -0,0 +1 @@ +instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. 
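The mask format just described (a normalized box_2d, a label, and a base64-encoded PNG probability map) can be decoded in a few lines of PIL/NumPy. The sketch below is illustrative only: the "box_2d" and "label" keys come from the description above, while the "mask" key name and the data-URI prefix are assumptions; the fuller example from the original page follows after this note.

```python
import base64
import io

import numpy as np
from PIL import Image


def decode_mask(item: dict, image_width: int, image_height: int, threshold: int = 127) -> np.ndarray:
    """Turn one segmentation item into a boolean mask sized to its bounding box."""
    # Assumption: item["mask"] is a base64 PNG, possibly prefixed with "data:image/png;base64,".
    png_b64 = item["mask"].split(",")[-1]
    prob_map = Image.open(io.BytesIO(base64.b64decode(png_b64)))

    # Convert normalized [0, 1000] box coordinates to absolute pixels.
    y0, x0, y1, x1 = item["box_2d"]
    y0, y1 = int(y0 / 1000 * image_height), int(y1 / 1000 * image_height)
    x0, x1 = int(x0 / 1000 * image_width), int(x1 / 1000 * image_width)

    # Resize the probability map to the box size, then binarize at the confidence threshold.
    prob_map = prob_map.resize((max(x1 - x0, 1), max(y1 - y0, 1)), Image.Resampling.LANCZOS)
    return np.array(prob_map.convert("L")) > threshold
```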
The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks \ No newline at end of file diff --git a/docstore/fec47aaf-6eb8-429a-ad1a-6c8d985e4d09 b/docstore/fec47aaf-6eb8-429a-ad1a-6c8d985e4d09 new file mode 100644 index 0000000000000000000000000000000000000000..1f747d7032d2c1e3fe25a9d1d2b24965593e287b --- /dev/null +++ b/docstore/fec47aaf-6eb8-429a-ad1a-6c8d985e4d09 @@ -0,0 +1 @@ +Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. \ No newline at end of file diff --git a/docstore/fed2fe2f-cd1e-423a-9e24-9652d050377e b/docstore/fed2fe2f-cd1e-423a-9e24-9652d050377e new file mode 100644 index 0000000000000000000000000000000000000000..ba28dbc7ce7bec083cb930373e6fd594666aa933 --- /dev/null +++ b/docstore/fed2fe2f-cd1e-423a-9e24-9652d050377e @@ -0,0 +1 @@ +my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. 
Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' \ No newline at end of file diff --git a/docstore/fef2f7eb-05ce-4763-b848-7eff0d54a277 b/docstore/fef2f7eb-05ce-4763-b848-7eff0d54a277 new file mode 100644 index 0000000000000000000000000000000000000000..3c1b7a7f28e24faa230d65b7608a5733d7c64407 --- /dev/null +++ b/docstore/fef2f7eb-05ce-4763-b848-7eff0d54a277 @@ -0,0 +1 @@ +single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . 
UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline \ No newline at end of file diff --git a/docstore/feff18be-567c-4052-9986-2e5c99cc876f b/docstore/feff18be-567c-4052-9986-2e5c99cc876f new file mode 100644 index 0000000000000000000000000000000000000000..cc60911e84f69efe9aed5a36f728569e3615d06d --- /dev/null +++ b/docstore/feff18be-567c-4052-9986-2e5c99cc876f @@ -0,0 +1 @@ +image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' \ No newline at end of file diff --git a/docstore/ff06983a-a24f-438f-9595-1bfb8c950b19 b/docstore/ff06983a-a24f-438f-9595-1bfb8c950b19 new file mode 100644 index 0000000000000000000000000000000000000000..cc60911e84f69efe9aed5a36f728569e3615d06d --- /dev/null +++ b/docstore/ff06983a-a24f-438f-9595-1bfb8c950b19 @@ -0,0 +1 @@ +image_file . read ()) . decode ( 'utf-8' ) # Getting the base64 string base64_image = encode_image ( "Path/to/agi/image.jpeg" ) response = client . chat . completions . create ( model = "gemini-2.0-flash" , messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : f "data:image/jpeg;base64, { base64_image } " }, }, ], } ], ) print ( response . choices [ 0 ]) JavaScript import OpenAI from "openai" ; import fs from 'fs/promises' ; const openai = new OpenAI ({ apiKey : "GEMINI_API_KEY" , baseURL : "https://generativelanguage.googleapis.com/v1beta/openai/" }); async function encodeImage ( imagePath ) { try { const imageBuffer = await fs . readFile ( imagePath ); return imageBuffer . 
toString ( 'base64' ); } catch ( error ) { console . error ( "Error encoding image:" , error ); return null ; } } async function main () { const imagePath = "Path/to/agi/image.jpeg" ; const base64Image = await encodeImage ( imagePath ); const messages = [ { "role" : "user" , "content" : [ { "type" : "text" , "text" : "What is in this image?" , }, { "type" : "image_url" , "image_url" : { "url" : `data:image/jpeg;base64, ${ base64Image } ` }, }, ], } ]; try { const response = await openai . chat . completions . create ({ model : "gemini-2.0-flash" , messages : messages , }); console . log ( response . choices [ 0 ]); } catch ( error ) { console . error ( "Error calling Gemini API:" , error ); } } main (); REST bash -c ' base64_image=$(base64 -i "Path/to/agi/image.jpeg"); curl "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer GEMINI_API_KEY" \ -d "{ \"model\": \"gemini-2.0-flash\", \"messages\": [ { \"role\": \"user\", \"content\": [ { \"type\": \"text\", \"text\": \"What is in this image?\" }, { \"type\": \"image_url\", \"image_url\": { \"url\": \"data:image/jpeg;base64,${base64_image}\" } } ] } ] }" ' \ No newline at end of file diff --git a/docstore/ff1cef40-3175-43e9-9ff1-c9e0a25b7b30 b/docstore/ff1cef40-3175-43e9-9ff1-c9e0a25b7b30 new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/ff1cef40-3175-43e9-9ff1-c9e0a25b7b30 @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . 
debug \ No newline at end of file diff --git a/docstore/ff2f7fcd-cb83-4046-835d-a7900e34a231 b/docstore/ff2f7fcd-cb83-4046-835d-a7900e34a231 new file mode 100644 index 0000000000000000000000000000000000000000..a8da0e92acb39cd3b0a95b76d425835072520bc1 --- /dev/null +++ b/docstore/ff2f7fcd-cb83-4046-835d-a7900e34a231 @@ -0,0 +1 @@ +Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/docstore/ff538afb-37b5-4e53-8424-ebddf4b8abdb b/docstore/ff538afb-37b5-4e53-8424-ebddf4b8abdb new file mode 100644 index 0000000000000000000000000000000000000000..5ca0fbf36f5c2d156e608b6f1a4c1aa0ccc73d8c --- /dev/null +++ b/docstore/ff538afb-37b5-4e53-8424-ebddf4b8abdb @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#main-content Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ff69d17d-ff91-47ab-9683-f2655adf8f4b b/docstore/ff69d17d-ff91-47ab-9683-f2655adf8f4b new file mode 100644 index 0000000000000000000000000000000000000000..e7df3841ca47d093238d5ecfeb9f7cc95942f8e5 --- /dev/null +++ b/docstore/ff69d17d-ff91-47ab-9683-f2655adf8f4b @@ -0,0 +1 @@ +data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl \ No newline at end of file diff --git a/docstore/ff771bdc-fb7c-4d85-bbf7-f23263faecb8 b/docstore/ff771bdc-fb7c-4d85-bbf7-f23263faecb8 new file mode 100644 index 0000000000000000000000000000000000000000..4ec402bc23287719ce34da8c619b76bb78fda53d --- /dev/null +++ b/docstore/ff771bdc-fb7c-4d85-bbf7-f23263faecb8 @@ -0,0 +1 @@ +Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 \ No newline at end of file diff --git a/docstore/ff7d1074-3f46-46cb-8995-dfb834df6bd8 b/docstore/ff7d1074-3f46-46cb-8995-dfb834df6bd8 new file mode 100644 index 0000000000000000000000000000000000000000..fc2e63c0fbe73768c251346fd94b01e59289b693 --- /dev/null +++ b/docstore/ff7d1074-3f46-46cb-8995-dfb834df6bd8 @@ -0,0 +1 @@ +Additional usage policies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Additional usage policies This page includes additional usage policies for the Gemini API. Abuse monitoring Google is committed to the responsible development and use of AI. To ensure the safety and integrity of the Gemini API, we have created these policy guidelines. By using the Gemini API, you agree to the following guidelines, the Gemini API Additional Terms of Service and Generative AI Prohibited Use Policy . How We Monitor for Misuse Google's Trust and Safety Team employs a combination of automated and manual processes to detect potential misuse of the Gemini API and enforce our policies. Automated Detection: Automated systems scan API usage for violations of our Prohibited Use Policy, such as hate speech, harassment, sexually explicit content, and dangerous content. Manual Detection: If a project consistently exhibits suspicious activity, it may be flagged for manual review by authorized Google personnel. How We Handle Data To help with abuse monitoring, Google retains the following data for fifty-five (55) days: Prompts: The text prompts you submit to the API. Contextual Information: Any additional context you provide with your prompts. Output: The responses generated by the Gemini API. How We Investigate Potential Issues When prompts or model outputs are flagged by safety filters and abuse detection systems described above, authorized Google employees may assess the flagged content, and either confirm or correct the classification or determination based on predefined guidelines and policies. Data can be accessed for human review only by authorized \ No newline at end of file diff --git a/docstore/ff97c6b1-e9f8-40ed-aba3-f1757e7814a8 b/docstore/ff97c6b1-e9f8-40ed-aba3-f1757e7814a8 new file mode 100644 index 0000000000000000000000000000000000000000..c19e2d1f2a8624bd1e7529d33c658d3154e40942 --- /dev/null +++ b/docstore/ff97c6b1-e9f8-40ed-aba3-f1757e7814a8 @@ -0,0 +1 @@ +response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. 
Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content. Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: \ No newline at end of file diff --git a/docstore/ff9e23ee-9dd6-4e33-9c9d-cdbd9b6f967c b/docstore/ff9e23ee-9dd6-4e33-9c9d-cdbd9b6f967c new file mode 100644 index 0000000000000000000000000000000000000000..e49b6d32b620c4bcec26adfb8021d682612c0f7e --- /dev/null +++ b/docstore/ff9e23ee-9dd6-4e33-9c9d-cdbd9b6f967c @@ -0,0 +1 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding Title: Image understanding | Gemini API | Google AI for Developers ================================================== \ No newline at end of file diff --git a/docstore/ffa76595-13a4-4e31-8b36-39d284ea4a38 b/docstore/ffa76595-13a4-4e31-8b36-39d284ea4a38 new file mode 100644 index 0000000000000000000000000000000000000000..5f25eb2a53a9afab2cc27675039b1ff3f0e2b594 --- /dev/null +++ b/docstore/ffa76595-13a4-4e31-8b36-39d284ea4a38 @@ -0,0 +1 @@ +suitable for production use. Review ephemeral tokens guide for more information. Consider adding restrictions to your key: You can limit a key's permissions by adding API key restrictions . This minimizes the potential damage if the key is ever leaked. For some general best practices, you can also review this support article . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
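The token rule of thumb and the sampling parameters above map directly onto the Python client used throughout these pages: count_tokens gives an exact prompt-token count, and GenerateContentConfig carries temperature, topP, topK, stop sequences, and an output cap. A minimal sketch, assuming the google-genai client and a GEMINI_API_KEY in the environment; the parameter values are illustrative, not recommendations:

```python
from google import genai
from google.genai import types

client = genai.Client()

prompt = "List three unusual uses for a paperclip."

# Exact token count for the prompt (more precise than the ~4 characters/token rule of thumb).
token_info = client.models.count_tokens(model="gemini-2.0-flash", contents=prompt)
print("Prompt tokens:", token_info.total_tokens)

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt,
    config=types.GenerateContentConfig(
        temperature=0.2,          # lower temperature = less random token selection
        top_p=0.95,               # nucleus sampling cutoff (the documented default)
        top_k=40,                 # sample from the 40 most probable tokens at each step
        max_output_tokens=200,    # cap the length of the response
        stop_sequences=["DONE"],  # stop generating if this string is produced
    ),
)
print(response.text)
```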
\ No newline at end of file diff --git a/docstore/ffd73aea-9211-4ce4-843a-056244a22a4a b/docstore/ffd73aea-9211-4ce4-843a-056244a22a4a new file mode 100644 index 0000000000000000000000000000000000000000..c651df5a8dc16b6ecacfcfbb978b8a98fa66456b --- /dev/null +++ b/docstore/ffd73aea-9211-4ce4-843a-056244a22a4a @@ -0,0 +1 @@ +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug \ No newline at end of file diff --git a/docstore/ffe1304d-3c27-41be-b864-c2ffb79b7f4a b/docstore/ffe1304d-3c27-41be-b864-c2ffb79b7f4a new file mode 100644 index 0000000000000000000000000000000000000000..34fafa88bef1190b729bdf255b8c99cfcd7b08b1 --- /dev/null +++ b/docstore/ffe1304d-3c27-41be-b864-c2ffb79b7f4a @@ -0,0 +1 @@ +Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo. Provide context : If necessary, include background information to help your model understand what you want. Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements. Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design . Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo like using the word portrait in the prompt. Example prompts and output This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video. 
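Before the worked prompt examples that follow, here is a minimal sketch of how a descriptive prompt written with these guidelines might be submitted for video generation, assuming the google-genai Python SDK; the model id veo-2.0-generate-001, the config values, and the output filename are assumptions for illustration, not taken from the original page.

# Sketch only: submitting a descriptive Veo prompt and polling the
# long-running operation, assuming the google-genai Python SDK.
# Model id, config values, and filename are illustrative assumptions.
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",  # assumed model id
    prompt=(
        "Close up shot of melting icicles on a frozen rock wall "
        "with cool blue tones, zoomed in, maintaining close-up detail "
        "of water drips."
    ),
    config=types.GenerateVideosConfig(aspect_ratio="16:9"),
)

# Video generation is asynchronous; poll until the operation completes.
while not operation.done:
    time.sleep(10)
    operation = client.operations.get(operation)

video = operation.response.generated_videos[0]
client.files.download(file=video.video)
video.video.save("icicles.mp4")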
Icicles This video demonstrates how you can use the elements of prompt writing basics in your prompt. Prompt Generated output Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action). Man on the phone These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking. Prompt Generated output Analysis The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene. This is the first generated video based on the prompt. A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, \ No newline at end of file diff --git a/docstore/ffeaa566-5c26-413c-b588-6033403e969f b/docstore/ffeaa566-5c26-413c-b588-6033403e969f new file mode 100644 index 0000000000000000000000000000000000000000..13654164ec6e7048af241ec2efbc514257c312b5 --- /dev/null +++ b/docstore/ffeaa566-5c26-413c-b588-6033403e969f @@ -0,0 +1 @@ +For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main \ No newline at end of file diff --git a/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/3c1d04911c269b925af977a3151c9704e990e4d0.lock b/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/3c1d04911c269b925af977a3151c9704e990e4d0.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b.lock b/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/df9f7c982cc0add6ab016b5d8e42e2609f59dec8.lock b/huggingface_tokenizers_cache/.locks/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/df9f7c982cc0add6ab016b5d8e42e2609f59dec8.lock new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/added_tokens.json b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/chat_template.jinja b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/tokenizer.model b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/.no_exist/26672447463f314a180021ded1522a55ce5b1090/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/3c1d04911c269b925af977a3151c9704e990e4d0 b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/3c1d04911c269b925af977a3151c9704e990e4d0 new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/3c1d04911c269b925af977a3151c9704e990e4d0 @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git 
a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/df9f7c982cc0add6ab016b5d8e42e2609f59dec8 b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/df9f7c982cc0add6ab016b5d8e42e2609f59dec8 new file mode 100644 index 0000000000000000000000000000000000000000..df9f7c982cc0add6ab016b5d8e42e2609f59dec8 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/blobs/df9f7c982cc0add6ab016b5d8e42e2609f59dec8 @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + 
"content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": 
"<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": 
"<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": 
"<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": 
"<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": 
"<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": 
"<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": 
"<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": 
"<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "left", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/refs/main b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/refs/main new file mode 100644 index 0000000000000000000000000000000000000000..a9706bda2f4d7b7ec75197149f2c5e7ca26e390e --- 
/dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/refs/main @@ -0,0 +1 @@ +26672447463f314a180021ded1522a55ce5b1090 \ No newline at end of file diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/special_tokens_map.json b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3c1d04911c269b925af977a3151c9704e990e4d0 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer.json b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer_config.json b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..df9f7c982cc0add6ab016b5d8e42e2609f59dec8 --- /dev/null +++ b/huggingface_tokenizers_cache/models--unsloth--llama-3.1-8b-instruct-bnb-4bit/snapshots/26672447463f314a180021ded1522a55ce5b1090/tokenizer_config.json @@ -0,0 +1,2067 @@ +{ + "add_bos_token": true, + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, 
+ "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, 
+ "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can 
slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|finetune_right_pad_id|>", + "padding_side": "left", + "tokenizer_class": "PreTrainedTokenizer", + "unk_token": null +} diff --git a/processed_files.db b/processed_files.db new file mode 100644 index 0000000000000000000000000000000000000000..c48d7af1572d875e4a3431a28610d1932828c392 Binary files /dev/null and b/processed_files.db differ diff --git a/unsloth_compiled_cache/UnslothAlignPropTrainer.py b/unsloth_compiled_cache/UnslothAlignPropTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..fdef990a597bb1ccb676ede258ed31132fbf6cd8 --- /dev/null +++ b/unsloth_compiled_cache/UnslothAlignPropTrainer.py @@ -0,0 +1,653 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.alignprop_trainer import (Accelerator, AlignPropConfig, AlignPropTrainer, Any, Callable, DDPOStableDiffusionPipeline, Optional, Path, ProjectConfiguration, PyTorchModelHubMixin, Union, defaultdict, generate_model_card, get_comet_experiment_url, is_wandb_available, logger, os, set_seed, textwrap, torch, wandb, warn) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn 
import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothAlignPropConfig(AlignPropConfig): + """ + + Configuration class for the [`AlignPropTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (defaults to the file name without the extension). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + seed (`int`, *optional*, defaults to `0`): + Random seed for reproducibility. + log_with (`str` or `None`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + log_image_freq (`int`, *optional*, defaults to `1`): + Frequency for logging images. + tracker_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g., `wandb_project`). + accelerator_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g., `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Path to resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size for training. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Whether to use the 8bit Adam optimizer from `bitsandbytes`. 
+ train_learning_rate (`float`, *optional*, defaults to `1e-3`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Beta1 for Adam optimizer. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Beta2 for Adam optimizer. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Weight decay for Adam optimizer. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Epsilon value for Adam optimizer. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + negative_prompts (`str` or `None`, *optional*, defaults to `None`): + Comma-separated list of prompts to use as negative examples. + truncated_backprop_rand (`bool`, *optional*, defaults to `True`): + If `True`, randomized truncation to different diffusion timesteps is used. + truncated_backprop_timestep (`int`, *optional*, defaults to `49`): + Absolute timestep to which the gradients are backpropagated. Used only if `truncated_backprop_rand=False`. + truncated_rand_backprop_minmax (`tuple[int, int]`, *optional*, defaults to `(0, 50)`): + Range of diffusion timesteps for randomized truncated backpropagation. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether to push the final model to the Hub. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + exp_name = 'colab_kernel_launcher', + run_name = '', + seed = 3407, + log_with = None, + log_image_freq = 1, + tracker_project_name = 'trl', + logdir = 'logs', + num_epochs = 100, + save_freq = 1, + num_checkpoint_limit = 5, + mixed_precision = 'fp16', + allow_tf32 = True, + resume_from = '', + sample_num_steps = 50, + sample_eta = 1.0, + sample_guidance_scale = 5.0, + train_batch_size = 1, + train_use_8bit_adam = False, + train_learning_rate = 5e-05, + train_adam_beta1 = 0.9, + train_adam_beta2 = 0.999, + train_adam_weight_decay = 0.01, + train_adam_epsilon = 1e-08, + train_gradient_accumulation_steps = 2, + train_max_grad_norm = 1.0, + negative_prompts = None, + truncated_backprop_rand = True, + truncated_backprop_timestep = 49, + push_to_hub = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + + super().__init__( + exp_name = exp_name, + run_name = run_name, + seed = seed, + log_with = log_with, + log_image_freq = log_image_freq, + tracker_project_name = tracker_project_name, + logdir = logdir, + num_epochs = num_epochs, + save_freq = save_freq, + num_checkpoint_limit = num_checkpoint_limit, + mixed_precision = mixed_precision, + allow_tf32 = allow_tf32, + resume_from = resume_from, + sample_num_steps = sample_num_steps, + sample_eta = sample_eta, + sample_guidance_scale = sample_guidance_scale, + train_batch_size = train_batch_size, + train_use_8bit_adam = train_use_8bit_adam, + train_learning_rate = train_learning_rate, + train_adam_beta1 = train_adam_beta1, + train_adam_beta2 = train_adam_beta2, + train_adam_weight_decay = train_adam_weight_decay, + train_adam_epsilon = train_adam_epsilon, + train_gradient_accumulation_steps = train_gradient_accumulation_steps, + train_max_grad_norm = train_max_grad_norm, + negative_prompts = negative_prompts, + truncated_backprop_rand = 
truncated_backprop_rand, + truncated_backprop_timestep = truncated_backprop_timestep, + push_to_hub = push_to_hub,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothAlignPropTrainer(PyTorchModelHubMixin): + """""" + + _tag_names = ["trl", "alignprop"] + + def __init__( + self, + config: AlignPropConfig, + reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor], + prompt_function: Callable[[], tuple[str, Any]], + sd_pipeline: DDPOStableDiffusionPipeline, + image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None, + ): + if image_samples_hook is None: + warn("No image_samples_hook provided; no images will be logged") + + self.prompt_fn = prompt_function + self.reward_fn = reward_function + self.config = config + self.image_samples_callback = image_samples_hook + + accelerator_project_config = ProjectConfiguration(**self.config.project_kwargs) + + if self.config.resume_from: + self.config.resume_from = os.path.normpath(os.path.expanduser(self.config.resume_from)) + if "checkpoint_" not in os.path.basename(self.config.resume_from): + # get the most recent checkpoint in this directory + checkpoints = list( + filter( + lambda x: "checkpoint_" in x, + os.listdir(self.config.resume_from), + ) + ) + if len(checkpoints) == 0: + raise ValueError(f"No checkpoints found in {self.config.resume_from}") + checkpoint_numbers = sorted([int(x.split("_")[-1]) for x in checkpoints]) + self.config.resume_from = os.path.join( + self.config.resume_from, + f"checkpoint_{checkpoint_numbers[-1]}", + ) + + accelerator_project_config.iteration = checkpoint_numbers[-1] + 1 + + self.accelerator = Accelerator( + log_with=self.config.log_with, + mixed_precision=self.config.mixed_precision, + project_config=accelerator_project_config, + # we always accumulate gradients across timesteps; we want config.train.gradient_accumulation_steps to be the + # number of *samples* we accumulate across, so we need to multiply by the number of training timesteps to get + # the total number of optimizer steps to accumulate across. + gradient_accumulation_steps=self.config.train_gradient_accumulation_steps, + **self.config.accelerator_kwargs, + ) + + is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" + + if self.accelerator.is_main_process: + self.accelerator.init_trackers( + self.config.tracker_project_name, + config=dict(alignprop_trainer_config=config.to_dict()) + if not is_using_tensorboard + else config.to_dict(), + init_kwargs=self.config.tracker_kwargs, + ) + + logger.info(f"\n{config}") + + set_seed(self.config.seed, device_specific=True) + + self.sd_pipeline = sd_pipeline + + self.sd_pipeline.set_progress_bar_config( + position=1, + disable=not self.accelerator.is_local_main_process, + leave=False, + desc="Timestep", + dynamic_ncols=True, + ) + + # For mixed precision training we cast all non-trainable weights [vae, non-lora text_encoder and non-lora unet] to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. 
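+ # `self.accelerator.mixed_precision` carries the precision string supplied through `config.mixed_precision`,
+ # so the branch below simply maps it to the torch dtype used for the frozen pipeline components.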
+ if self.accelerator.mixed_precision == "fp16": + inference_dtype = torch.float16 + elif self.accelerator.mixed_precision == "bf16": + inference_dtype = torch.bfloat16 + else: + inference_dtype = torch.float32 + + self.sd_pipeline.vae.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.text_encoder.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.unet.to(self.accelerator.device, dtype=inference_dtype) + + trainable_layers = self.sd_pipeline.get_trainable_layers() + + self.accelerator.register_save_state_pre_hook(self._save_model_hook) + self.accelerator.register_load_state_pre_hook(self._load_model_hook) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if self.config.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + self.optimizer = self._setup_optimizer( + trainable_layers.parameters() if not isinstance(trainable_layers, list) else trainable_layers + ) + + self.neg_prompt_embed = self.sd_pipeline.text_encoder( + self.sd_pipeline.tokenizer( + [""] if self.config.negative_prompts is None else self.config.negative_prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + )[0] + + # NOTE: for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses + # more memory + self.autocast = self.sd_pipeline.autocast or self.accelerator.autocast + + if hasattr(self.sd_pipeline, "use_lora") and self.sd_pipeline.use_lora: + unet, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + self.trainable_layers = list(filter(lambda p: p.requires_grad, unet.parameters())) + else: + self.trainable_layers, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + + if config.resume_from: + logger.info(f"Resuming from {config.resume_from}") + self.accelerator.load_state(config.resume_from) + self.first_epoch = int(config.resume_from.split("_")[-1]) + 1 + else: + self.first_epoch = 0 + + def compute_rewards(self, prompt_image_pairs): + reward, reward_metadata = self.reward_fn( + prompt_image_pairs["images"], prompt_image_pairs["prompts"], prompt_image_pairs["prompt_metadata"] + ) + return reward + + def step(self, epoch: int, global_step: int): + """ + Perform a single step of training. + + Args: + epoch (int): The current epoch. + global_step (int): The current global step. + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. + - If `self.image_samples_callback` is not None, it will be called with the prompt_image_pairs, global_step, + and the accelerator tracker. + + Returns: + global_step (int): The updated global step. 
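+
+ Example:
+ A minimal sketch of how `step` is typically driven (this mirrors what `train()` does;
+ `trainer` is assumed to be an already constructed trainer instance):
+
+     global_step = 0
+     for epoch in range(trainer.first_epoch, trainer.config.num_epochs):
+         global_step = trainer.step(epoch, global_step)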
+ """ + info = defaultdict(list) + + self.sd_pipeline.unet.train() + + for _ in range(self.config.train_gradient_accumulation_steps): + with self.accelerator.accumulate(self.sd_pipeline.unet), self.autocast(), torch.enable_grad(): + prompt_image_pairs = self._generate_samples( + batch_size=self.config.train_batch_size, + ) + + rewards = self.compute_rewards(prompt_image_pairs) + + prompt_image_pairs["rewards"] = rewards + + rewards_vis = self.accelerator.gather(rewards).detach().cpu().numpy() + + loss = self.calculate_loss(rewards) + + self.accelerator.backward(loss) + + if self.accelerator.sync_gradients: + self.accelerator.clip_grad_norm_( + self.trainable_layers.parameters() + if not isinstance(self.trainable_layers, list) + else self.trainable_layers, + self.config.train_max_grad_norm, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + + info["reward_mean"].append(rewards_vis.mean()) + info["reward_std"].append(rewards_vis.std()) + info["loss"].append(loss.item()) + + # Checks if the accelerator has performed an optimization step behind the scenes + if self.accelerator.sync_gradients: + # log training-related stuff + info = {k: torch.mean(torch.tensor(v)) for k, v in info.items()} + info = self.accelerator.reduce(info, reduction="mean") + info.update({"epoch": epoch}) + self.accelerator.log(info, step=global_step) + global_step += 1 + info = defaultdict(list) + else: + raise ValueError( + "Optimization step should have been performed by this point. Please check calculated gradient accumulation settings." + ) + # Logs generated images + if self.image_samples_callback is not None and global_step % self.config.log_image_freq == 0: + self.image_samples_callback(prompt_image_pairs, global_step, self.accelerator.trackers[0]) + + if epoch != 0 and epoch % self.config.save_freq == 0 and self.accelerator.is_main_process: + self.accelerator.save_state() + + return global_step + + def calculate_loss(self, rewards): + """ + Calculate the loss for a batch of an unpacked sample + + Args: + rewards (torch.Tensor): + Differentiable reward scalars for each generated image, shape: [batch_size] + + Returns: + loss (torch.Tensor) (all of these are of shape (1,)) + """ + # Loss is specific to Aesthetic Reward function used in AlignProp (https://huggingface.co/papers/2310.03739) + loss = 10.0 - (rewards).mean() + return loss + + def loss( + self, + advantages: torch.Tensor, + clip_range: float, + ratio: torch.Tensor, + ): + unclipped_loss = -advantages * ratio + clipped_loss = -advantages * torch.clamp( + ratio, + 1.0 - clip_range, + 1.0 + clip_range, + ) + return torch.mean(torch.maximum(unclipped_loss, clipped_loss)) + + def _setup_optimizer(self, trainable_layers_parameters): + if self.config.train_use_8bit_adam: + import bitsandbytes + + optimizer_cls = bitsandbytes.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + return optimizer_cls( + trainable_layers_parameters, + lr=self.config.train_learning_rate, + betas=(self.config.train_adam_beta1, self.config.train_adam_beta2), + weight_decay=self.config.train_adam_weight_decay, + eps=self.config.train_adam_epsilon, + ) + + def _save_model_hook(self, models, weights, output_dir): + self.sd_pipeline.save_checkpoint(models, weights, output_dir) + weights.pop() # ensures that accelerate doesn't try to handle saving of the model + + def _load_model_hook(self, models, input_dir): + self.sd_pipeline.load_checkpoint(models, input_dir) + models.pop() # ensures that accelerate doesn't try to handle loading of the model + + def 
_generate_samples(self, batch_size, with_grad=True, prompts=None): + """ + Generate samples from the model + + Args: + batch_size (int): Batch size to use for sampling + with_grad (bool): Whether the generated RGBs should have gradients attached to it. + + Returns: + prompt_image_pairs (dict[Any]) + """ + prompt_image_pairs = {} + + sample_neg_prompt_embeds = self.neg_prompt_embed.repeat(batch_size, 1, 1) + + if prompts is None: + prompts, prompt_metadata = zip(*[self.prompt_fn() for _ in range(batch_size)]) + else: + prompt_metadata = [{} for _ in range(batch_size)] + + prompt_ids = self.sd_pipeline.tokenizer( + prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + + prompt_embeds = self.sd_pipeline.text_encoder(prompt_ids)[0] + + if with_grad: + sd_output = self.sd_pipeline.rgb_with_grad( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + truncated_backprop_rand=self.config.truncated_backprop_rand, + truncated_backprop_timestep=self.config.truncated_backprop_timestep, + truncated_rand_backprop_minmax=self.config.truncated_rand_backprop_minmax, + output_type="pt", + ) + else: + sd_output = self.sd_pipeline( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + output_type="pt", + ) + + images = sd_output.images + + prompt_image_pairs["images"] = images + prompt_image_pairs["prompts"] = prompts + prompt_image_pairs["prompt_metadata"] = prompt_metadata + + return prompt_image_pairs + + def train(self, epochs: Optional[int] = None): + """ + Train the model for a given number of epochs + """ + global_step = 0 + if epochs is None: + epochs = self.config.num_epochs + for epoch in range(self.first_epoch, epochs): + global_step = self.step(epoch, global_step) + + def _save_pretrained(self, save_directory): + self.sd_pipeline.save_pretrained(save_directory) + self.create_model_card() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
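+
+ Example:
+ A hypothetical call after training finishes (all names below are illustrative):
+
+     trainer.create_model_card(
+         model_name="my-alignprop-run",
+         dataset_name="my-prompt-dataset",
+         tags=["diffusion", "alignprop"],
+     )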
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{prabhudesai2024aligning, + title = {{Aligning Text-to-Image Diffusion Models with Reward Backpropagation}}, + author = {Mihir Prabhudesai and Anirudh Goyal and Deepak Pathak and Katerina Fragkiadaki}, + year = 2024, + eprint = {arXiv:2310.03739} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="AlignProp", + trainer_citation=citation, + paper_title="Aligning Text-to-Image Diffusion Models with Reward Backpropagation", + paper_id="2310.03739", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothAlignPropTrainer(_UnslothAlignPropTrainer): + """ + + The AlignPropTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. Note, this trainer is + heavily inspired by the work here: https://github.com/mihirp1998/AlignProp/ As of now only Stable Diffusion based + pipelines are supported + + Attributes: + config (`AlignPropConfig`): + Configuration object for AlignPropTrainer. Check the documentation of `PPOConfig` for more details. + reward_function (`Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]`): + Reward function to be used + prompt_function (`Callable[[], tuple[str, Any]]`): + Function to generate prompts to guide model + sd_pipeline (`DDPOStableDiffusionPipeline`): + Stable Diffusion pipeline to be used for training. 
+ image_samples_hook (`Optional[Callable[[Any, Any, Any], Any]]`): + Hook to be called to log images + + """ + def __init__( + self, + config, + reward_function, + prompt_function, + sd_pipeline, + image_samples_hook = None, + **kwargs + ): + if config is None: config = UnslothAlignPropConfig() + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('alignprop_trainer', other_metrics) + + super().__init__( + config = config, + reward_function = reward_function, + prompt_function = prompt_function, + sd_pipeline = sd_pipeline, + image_samples_hook = image_samples_hook,**kwargs) + +pass diff --git a/unsloth_compiled_cache/UnslothBCOTrainer.py b/unsloth_compiled_cache/UnslothBCOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..9622b9bbbf5125a2c90d105f7c439a541ed49596 --- /dev/null +++ b/unsloth_compiled_cache/UnslothBCOTrainer.py @@ -0,0 +1,1823 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.bco_trainer import (Any, AutoModelForCausalLM, BCOConfig, BCOTrainer, BaseImageProcessor, CLF_NAME, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, LogisticRegression, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RUNNING_NAME, RunningMoments, SequentialSampler, Trainer, TrainerCallback, TrainingArguments, Union, _process_tokens, _tokenize, autocast, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, has_length, inspect, is_comet_available, is_joblib_available, is_peft_available, is_sklearn_available, is_wandb_available, itemgetter, joblib, log_table_to_comet_experiment, logger, maybe_apply_chat_template, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, textwrap, torch, tqdm, wandb, warnings, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, logger, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothBCOConfig(BCOConfig): + """ + + Configuration class for the [`BCOTrainer`]. + + This class includes only the parameters that are specific to BCO training.
For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from both the model and the reference model to W&B or Comet + during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + prompt_sample_size (`int`, *optional*, defaults to `1024`): + Number of prompts that are fed to density ratio classifier. + min_density_ratio (`float`, *optional*, defaults to `0.5`): + Minimum value of the density ratio. The estimated density ratio is clamped to this value. + max_density_ratio (`float`, *optional*, defaults to `10.0`): + Maximum value of the density ratio. 
The estimated density ratio is clamped to this value. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + 
include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + disable_dropout = True, + generate_during_eval = False, + is_encoder_decoder = None, + precompute_ref_log_probs = False, + model_init_kwargs = None, + ref_model_init_kwargs = None, + dataset_num_proc = None, + prompt_sample_size = 1024, + min_density_ratio = 0.5, + max_density_ratio = 10.0, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = 
tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + disable_dropout = disable_dropout, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + precompute_ref_log_probs = precompute_ref_log_probs, + model_init_kwargs = model_init_kwargs, + ref_model_init_kwargs = ref_model_init_kwargs, + dataset_num_proc = dataset_num_proc, + prompt_sample_size = prompt_sample_size, + min_density_ratio = min_density_ratio, + 
max_density_ratio = max_density_ratio,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothBCOTrainer(Trainer): + r"""""" + + _tag_names = ["trl", "bco"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, str] = None, + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: BCOConfig = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + data_collator: Optional[DataCollator] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + model_adapter_name: Optional[str] = None, + ref_adapter_name: Optional[str] = None, + embedding_func: Optional[Callable] = None, + embedding_tokenizer: Optional[PreTrainedTokenizerBase] = None, + ): + if embedding_func is not None and not (is_sklearn_available() and is_joblib_available()): + raise ImportError( + "BCOTrainer with UDM requires the scikit-learn and joblib libraries. Please install it with `pip install scikit-learn joblib`." + ) + + if type(args) is TrainingArguments: + raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.") + + if not isinstance(model, str) and model is not None and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must mass a copy of it, or `None` if you use peft." + ) + + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the BCOTrainer. But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + torch_dtype = model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." + ) + model_init_kwargs["torch_dtype"] = torch_dtype + + if args.ref_model_init_kwargs is None: + ref_model_init_kwargs = {} + elif not isinstance(ref_model, str): + raise ValueError( + "You passed ref_model_kwargs to the BCOTrainer. But your ref_model is already instantiated." + ) + else: + ref_model_init_kwargs = args.ref_model_init_kwargs + torch_dtype = ref_model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the BCOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." 
+ ) + ref_model_init_kwargs["torch_dtype"] = torch_dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + if isinstance(ref_model, str): + ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = model_adapter_name + self.ref_adapter_name = ref_adapter_name + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + if processing_class is None: + raise ValueError( + "max_length or a processing_class must be specified when using the default DPODataCollatorWithPadding" + ) + if args.max_length is None: + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `max_length` in the `BCOConfig`. " + "It will be set to `512` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_length = 512 + if args.max_length is not None: + max_length = args.max_length + + if args.max_prompt_length is None: + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the `BCOConfig`. " + "It will be set to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_prompt_length = 128 + if args.max_prompt_length is not None: + max_prompt_length = args.max_prompt_length + + max_completion_length = None + if args.max_completion_length is None and self.is_encoder_decoder: + warnings.warn( + "When using DPODataCollatorWithPadding with an encoder decoder architecture, you should set `max_completion_length` in the BCOTrainer's init" + " it will be set to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_completion_length = 128 + if args.max_completion_length is not None and self.is_encoder_decoder: + max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your BCOConfig" + " we have set it for you, but you should do it yourself in the future.", + UserWarning, + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.max_completion_length = max_completion_length + self.precompute_ref_log_probs = args.precompute_ref_log_probs + + # Since ref_logs are precomputed on the first call to get_train/eval_dataloader + # keep track of first called to avoid computation of future calls + 
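+ # (These flags are set to True in get_train_dataloader / get_eval_dataloader once the
+ # reference log-probs have been added to the corresponding dataset as a column.)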
self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + + # metric + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # BCO parameter + self.beta = args.beta + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + warnings.warn( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + UserWarning, + ) + + # Underlying Distribution Matching argument + self.embedding_func = embedding_func + self.embedding_tokenizer = embedding_tokenizer + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in BCO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + with PartialState().main_process_first(): + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + + # Tokenize and prepare the training datasets + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs={"tokenizer": processing_class, "embedding_tokenizer": self.embedding_tokenizer}, + num_proc=args.dataset_num_proc, + desc="Tokenizing train dataset", + ) + + # Prepare the datasets + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + train_dataset = train_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized train dataset", + ) + + if eval_dataset is not None: + # Tokenize + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs={"tokenizer": processing_class, "embedding_tokenizer": self.embedding_tokenizer}, + batched=True, + num_proc=args.dataset_num_proc, + desc="Tokenizing eval dataset", + ) + + # Process + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + eval_dataset = eval_dataset.map( + _process_tokens, + 
fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized eval dataset", + ) + + desirable = train_dataset.filter( + lambda x: x["label"], num_proc=args.dataset_num_proc, desc="Filtering desirable examples" + ) + undesirable = train_dataset.filter( + lambda x: not x["label"], num_proc=args.dataset_num_proc, desc="Filtering undesirable examples" + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + self.running = RunningMoments(accelerator=self.accelerator) + + if self.embedding_func is None or args.resume_from_checkpoint: + return + + chosen_embeddings = self._get_sample_prompt_embeddings(desirable, sample_size=self.args.prompt_sample_size) + rejected_embeddings = self._get_sample_prompt_embeddings(undesirable, sample_size=self.args.prompt_sample_size) + + embeddings = torch.cat((chosen_embeddings, rejected_embeddings), dim=0) + labels = torch.cat( + (torch.ones_like(chosen_embeddings[:, 0]), torch.zeros_like(rejected_embeddings[:, 0])), dim=0 + ) + + self.clf = LogisticRegression(class_weight="balanced").fit( + embeddings.cpu().float().numpy(), labels.cpu().numpy() + ) + chosen_mean = self.clf.score( + chosen_embeddings.cpu().float().numpy(), torch.ones_like(chosen_embeddings[:, 0]).cpu().numpy() + ) + rejected_mean = self.clf.score( + rejected_embeddings.cpu().float().numpy(), torch.zeros_like(rejected_embeddings[:, 0]).cpu().numpy() + ) + logger.info(f"UDM classifier training scores: chosen: {chosen_mean}, rejected: {rejected_mean}") + + @property + def match_underlying_distribution(self): + return self.embedding_func is not None and self.embedding_tokenizer is not None + + def _get_chosen_prob(self, prompt_embeddings: torch.FloatTensor) -> torch.FloatTensor: + """ + Calculates the probability if the given prompt embedding is from desirable dataset. 
This function calculates + the probability in the process and ensemble across processes. + """ + dtype = prompt_embeddings.dtype + device = prompt_embeddings.device + rank = self.accelerator.process_index + + padded_prompt_embeddings = self.accelerator.pad_across_processes( + prompt_embeddings, pad_index=self.embedding_tokenizer.pad_token_id + ) + sample_size = padded_prompt_embeddings.shape[0] + nonzero = padded_prompt_embeddings.mean(dim=1) != self.embedding_tokenizer.pad_token_id + prompt_embeddings = self.accelerator.gather(padded_prompt_embeddings) + + # cannot predict for all empty values + if prompt_embeddings.shape[0] == 0: + return torch.tensor([], device=device, dtype=dtype) + + prob = self.clf.predict_proba(prompt_embeddings.cpu().float().numpy())[:, 1] + prob = torch.as_tensor(prob, dtype=dtype, device=device) + prob = self.accelerator.reduce(prob, reduction="mean") + + prob = prob[sample_size * rank : sample_size * (rank + 1)] + prob = prob[nonzero] + + return prob + + def _vectorize_prompt(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor) -> torch.FloatTensor: + """ + Replaces processing_class.pad_token_id to embedding_tokenizer.pad_token_id and applies self.embedding_func + """ + input_ids = torch.where( + input_ids == self.processing_class.pad_token_id, + self.embedding_tokenizer.pad_token_id, + input_ids, + ) + + with torch.no_grad(): + embeddings = self.embedding_func( + input_ids=input_ids, + attention_mask=attention_mask, + ) + + return embeddings + + def _get_prompt_embeddings( + self, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: + """Extract embeddings from frozen embedding model""" + + if not self.match_underlying_distribution: + return None, None + + embeddings = self._vectorize_prompt( + input_ids=batch["embedding_input_ids"], + attention_mask=batch["embedding_attention_mask"], + ) + + chosen_idx = [i for i in range(len(batch["label"])) if batch["label"][i] is True] + rejected_idx = [i for i in range(len(batch["label"])) if batch["label"][i] is False] + + chosen_embeddings = embeddings[chosen_idx, ...] + rejected_embeddings = embeddings[rejected_idx, ...] + + return (chosen_embeddings, rejected_embeddings) + + def _get_sample_prompt_embeddings(self, dataset: Dataset, sample_size: int = 512) -> torch.FloatTensor: + """ + Sample instances from dataset and get prompt embeddings. Used for density ratio classifier training. 
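+
+ For reference, `__init__` consumes the returned embeddings roughly like:
+
+     chosen_embeddings = self._get_sample_prompt_embeddings(desirable, sample_size=self.args.prompt_sample_size)
+     rejected_embeddings = self._get_sample_prompt_embeddings(undesirable, sample_size=self.args.prompt_sample_size)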
+ """ + n_samples = min(len(dataset), sample_size) + rand_indices = np.random.choice(len(dataset), size=(n_samples,)) + + embedding_dataset = dataset.select(rand_indices) + + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(embedding_dataset, **dataloader_params)) + + with torch.no_grad(): + all_embeddings = torch.empty(0) + for padded_batch in tqdm(iterable=data_loader, desc="Building sample prompt embeddings"): + embeddings = self._vectorize_prompt( + input_ids=padded_batch["embedding_input_ids"], + attention_mask=padded_batch["embedding_attention_mask"], + ) + embeddings = self.accelerator.gather_for_metrics(embeddings) + all_embeddings = torch.cat((all_embeddings, embeddings.cpu())) + + return all_embeddings + + def _save_optimizer_and_scheduler(self, output_dir): + output_dir = output_dir if output_dir is not None else self.args.output_dir + super()._save_optimizer_and_scheduler(output_dir) + + if self.accelerator.is_main_process: + # When saving optimizer and scheduler to checkpoint, save also the running delta object. + self.running.save_to_json(os.path.join(output_dir, RUNNING_NAME)) + + if self.match_underlying_distribution: + joblib.dump(self.clf, os.path.join(output_dir, CLF_NAME), compress=True) + + def _load_optimizer_and_scheduler(self, checkpoint): + if checkpoint is None: + logger.warning_once(f"Missing Checkpoint {checkpoint}") + return + + super()._load_optimizer_and_scheduler(checkpoint) + + # when loading optimizer and scheduler from checkpoint, also load the running delta object. + running_file = os.path.join(checkpoint, RUNNING_NAME) + if os.path.isfile(running_file): + self.running = RunningMoments.load_from_json(self.accelerator, running_file) + + if self.match_underlying_distribution: + clf_file = os.path.join(checkpoint, CLF_NAME) + if os.path.isfile(clf_file): + self.clf = joblib.load(clf_file) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`. 
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + reference_completion_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + reference_completion_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + self.train_dataset = self.train_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. + """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_eval_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + reference_completion_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + reference_completion_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + eval_dataset = eval_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + def compute_reference_log_probs(self, padded_batch: dict) -> dict: + """Computes log probabilities of the reference model for a single padded batch of a BCO specific dataset.""" + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + if self.is_encoder_decoder: + completion_logits = self.model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + 
labels=padded_batch["completion_labels"], + ).logits + + else: + completion_logits = self.model( + padded_batch["completion_input_ids"], + attention_mask=padded_batch["completion_attention_mask"], + ).logits + + else: + if self.is_encoder_decoder: + completion_logits = self.ref_model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + else: + completion_logits = self.ref_model( + padded_batch["completion_input_ids"], attention_mask=padded_batch["completion_attention_mask"] + ).logits + + completion_logps = self.get_batch_logps( + completion_logits, + padded_batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + return completion_logps + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. + """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + else: + # Fixes end-dec RuntimeError + labels = labels.clone() + + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + **model_kwargs, + ) + completion_logits = outputs.logits + + completion_logps = self.get_batch_logps( + completion_logits, + batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if completion_logps.shape[0] != len(batch["label"]): + raise ValueError( + "There is a mismatch between the number of examples in this batch and the number of " + "examples for which an output sequence was predicted." 
+ ) + + chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is False] + + chosen_logps = completion_logps[chosen_idx, ...] + rejected_logps = completion_logps[rejected_idx, ...] + + chosen_logits = completion_logits[chosen_idx, ...] + rejected_logits = completion_logits[rejected_idx, ...] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, outputs.aux_loss) + else: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits) + + def _get_udm_weight(self, rejected_embeddings: torch.FloatTensor) -> torch.FloatTensor: + prob_desirable = self._get_chosen_prob(rejected_embeddings) + min_ratio = self.args.min_density_ratio + max_ratio = self.args.max_density_ratio + + weight = (prob_desirable / (1 - prob_desirable + 1e-8)).clamp(min=min_ratio, max=max_ratio) + + return weight + + def bco_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + chosen_embeddings: Optional[torch.FloatTensor], + rejected_embeddings: Optional[torch.FloatTensor], + do_train: bool = True, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the BCO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,) + reference_chosen_logps: + Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,) + reference_rejected_logps: + Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in + batch_size,) + chosen_embeddings: embeddings of desirable prompts + rejected_embeddings: embeddings of undesirable prompts + + Returns: + A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, delta). The losses tensor contains the + BCO loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards + for the chosen and rejected responses, respectively. The delta value contains the moving average of all + implicit rewards. 
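+
+ As a reading aid, the loss computed below can be summarized as (pseudocode mirroring the
+ implementation; the numeric example is purely illustrative):
+
+ chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps)
+ rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps)
+ delta = current mean of the running moments (updated with new rewards only when do_train=True)
+ chosen_losses = -logsigmoid(chosen_rewards - delta)
+ rejected_losses = -logsigmoid(-(rejected_rewards - delta))
+
+ For instance, with beta = 0.1, a chosen log-ratio of 2.0 and delta = 0.05, the chosen reward is
+ 0.2 and its loss is -log(sigmoid(0.2 - 0.05)) ≈ 0.62.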
+ """ + + chosen_logratios = policy_chosen_logps - reference_chosen_logps + chosen_rewards = self.beta * chosen_logratios + + rejected_logratios = policy_rejected_logps - reference_rejected_logps + rejected_rewards = self.beta * rejected_logratios + + if do_train: + self.running.update(torch.cat((chosen_rewards, rejected_rewards), 0).detach()) + delta = torch.as_tensor(self.running.mean, device=chosen_rewards.device) + + chosen_losses = -F.logsigmoid(chosen_rewards - delta) + rejected_losses = -F.logsigmoid(-(rejected_rewards - delta)) + + if self.match_underlying_distribution: + chosen_weight = torch.ones_like(chosen_losses) + rejected_weight = self._get_udm_weight(rejected_embeddings) + + losses = torch.cat((chosen_weight * chosen_losses, rejected_weight * rejected_losses), dim=0) + else: + losses = torch.cat((chosen_losses, rejected_losses), dim=0) + + return losses, chosen_rewards, rejected_rewards, delta + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + do_train: bool = True, + ): + """Compute the BCO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()} + + forward_output = self.forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + ) = forward_output[:4] + if self.aux_loss_enabled: + aux_loss = forward_output[4] + + # if reference_logps in batch use them, otherwise use the reference model + if "reference_logps" in batch: + chosen_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is False] + + reference_chosen_logps = batch["reference_logps"][chosen_idx, ...] + reference_rejected_logps = batch["reference_logps"][rejected_idx, ...] 
+ else: + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + ) = self.forward(self.model, batch)[:4] + else: + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + ) = self.forward(self.ref_model, batch)[:4] + + chosen_embeddings, rejected_embeddings = self._get_prompt_embeddings(batch) + + losses, chosen_rewards, rejected_rewards, delta = self.bco_loss( + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps, + reference_rejected_logps, + chosen_embeddings, + rejected_embeddings, + do_train=do_train, + ) + metrics["delta"] = self.accelerator.gather_for_metrics(delta).mean().item() + + num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device) + num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device) + + all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item() + all_num_rejected = self.accelerator.gather_for_metrics(num_rejected).sum().item() + + if all_num_chosen > 0: + metrics["rewards/chosen_sum"] = ( + self.accelerator.gather_for_metrics(chosen_rewards.nansum()).nansum().item() + ) + metrics["logps/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logps.nansum()).nansum().item() + ) + metrics["logits/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.nansum()).nansum().item() + ) + metrics["count/chosen"] = all_num_chosen + + if all_num_rejected > 0: + metrics["rewards/rejected_sum"] = ( + self.accelerator.gather_for_metrics(rejected_rewards.nansum()).nansum().item() + ) + metrics["logps/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps.nansum()).nansum().item() + ) + metrics["logits/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.nansum()).nansum().item() + ) + metrics["count/rejected"] = all_num_rejected + + loss = losses.nanmean() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Optional[torch.utils.data.Sampler]: + if dataset is None: + dataset = self.train_dataset + if dataset is None or not has_length(dataset): + return None + return SequentialSampler(dataset) + + def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses 
`generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + # if reference_output in batch use that otherwise use the reference model + if "reference_output" in batch: + reference_output = batch["reference_output"] + else: + if self.ref_model is None: + with self.null_ref_context(): + reference_output = self.model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + else: + reference_output = self.ref_model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + reference_output = pad_to_length(reference_output, self.max_length, self.processing_class.pad_token_id) + reference_output_decoded = self.processing_class.batch_decode(reference_output, skip_special_tokens=True) + + return policy_output_decoded, reference_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, do_train=False) + + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = {} + if "logits/chosen_sum" in metrics: + logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"] + if "logits/rejected_sum" in metrics: + logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"] + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. 
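+
+ When `generate_during_eval=True` is set in the config, one randomly sampled eval batch is
+ decoded with both the policy and the reference model and logged as a "game_log" table to
+ Weights & Biases or Comet. A minimal sketch of turning this on (values are hypothetical, and
+ `wandb` or `comet_ml` must be installed and listed in `report_to`):
+
+ args = UnslothBCOConfig(output_dir="out", generate_during_eval=True, report_to="wandb")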
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + target_indicies = [i for i in range(len(random_batch["label"])) if random_batch["label"][i] is False] + target_batch = { + "prompt_input_ids": random_batch["prompt_input_ids"][target_indicies], + "prompt_attention_mask": random_batch["prompt_attention_mask"][target_indicies], + "prompt": itemgetter(*target_indicies)(random_batch["prompt"]), + } + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, target_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip(target_batch["prompt"], policy_output_decoded, ref_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # train metrics should have no prefix, eval should have 'eval_' + prefix = "eval_" if train_eval == "eval" else "" + # accumulate average metrics from sums and lengths + for split in ["chosen", "rejected"]: + if f"count/{split}" in self._stored_metrics[train_eval]: + count_sum = torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"]).sum().item() + for metric in ["rewards", "logps", "logits"]: + logs[f"{prefix}{metric}/{split}"] = ( + torch.Tensor(self._stored_metrics[train_eval][f"{metric}/{split}_sum"]).sum().item() + / count_sum + ) + # delete obsolete metric + del self._stored_metrics[train_eval][f"{metric}/{split}_sum"] + del self._stored_metrics[train_eval][f"count/{split}"] + # calculate reward margin + if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs: + logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"] + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{jung2024binary, + title = {{Binary Classifier Optimization for Large Language Model Alignment}}, + author = {Seungjae Jung and Gunsoo Han and Daniel Wontae Nam and Kyoung{-}Woon On}, + year = 2024, + eprint = {arXiv:2404.04656} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="BCO", + trainer_citation=citation, + paper_title="Binary Classifier Optimization for Large Language Model Alignment", + paper_id="2404.04656", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothBCOTrainer(_UnslothBCOTrainer): + """ + + Initialize BCOTrainer from [BCO](https://huggingface.co/papers/2404.04656) paper. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args (`BCOConfig`): + The arguments to use for training. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + data_collator (`transformers.DataCollator`, *optional*, defaults to `None`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. 
+ compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + model_adapter_name (`str`, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str`, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + + """ + def __init__( + self, + model = None, + ref_model = None, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + data_collator = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + model_adapter_name = None, + ref_adapter_name = None, + embedding_func = None, + embedding_tokenizer = None, + **kwargs + ): + if args is None: args = UnslothBCOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('bco_trainer', other_metrics) + + super().__init__( + model = model, + ref_model = ref_model, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + data_collator = data_collator, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name, + embedding_func = embedding_func, + embedding_tokenizer = embedding_tokenizer,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothCPOTrainer.py b/unsloth_compiled_cache/UnslothCPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..884a61247780b2e5fbbafab51c7fbdb9b82ec0f7 --- /dev/null +++ b/unsloth_compiled_cache/UnslothCPOTrainer.py @@ -0,0 +1,1596 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.cpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, CPOConfig, CPOTrainer, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_wandb_available, log_table_to_comet_experiment, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, textwrap, torch, wandb, warnings, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as 
np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothCPOConfig(CPOConfig): + """ + + Configuration class for the [`CPOTrainer`]. + + This class includes only the parameters that are specific to CPO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). + label_smoothing (`float`, *optional*, defaults to `0.0`): + Label smoothing factor. This argument is required if you want to use the default data collator. + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the + [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper. + + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + cpo_alpha (`float`, *optional*, defaults to `1.0`): + Weight of the BC regularizer in CPO training. + simpo_gamma (`float`, *optional*, defaults to `0.5`): + Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. 
+ label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`,*optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B or Comet during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + 
fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + label_smoothing = 0.0, + loss_type = 'sigmoid', + disable_dropout = True, + cpo_alpha = 1.0, + simpo_gamma = 0.5, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + generate_during_eval = False, + is_encoder_decoder = None, + model_init_kwargs = None, + dataset_num_proc = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + 
ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + label_smoothing = label_smoothing, + loss_type = loss_type, + disable_dropout = disable_dropout, + cpo_alpha = cpo_alpha, + simpo_gamma = simpo_gamma, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + model_init_kwargs = model_init_kwargs, + dataset_num_proc = dataset_num_proc,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothCPOTrainer(Trainer): + r"""""" + + _tag_names = ["trl", "cpo"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: Optional[CPOConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + ): + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the CPOTrainer. 
But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + torch_dtype = model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the CPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." + ) + model_init_kwargs["torch_dtype"] = torch_dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + if self.is_encoder_decoder: + self.decoder_start_token_id = model.config.decoder_start_token_id + self.pad_token_id = model.config.pad_token_id + + if processing_class is None: + raise ValueError("processing_class must be specified to tokenize a CPO dataset.") + if args.max_length is None: + warnings.warn( + "`max_length` is not set in the CPOConfig's init" + " it will default to `512` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_length = 512 + else: + max_length = args.max_length + if args.max_prompt_length is None: + warnings.warn( + "`max_prompt_length` is not set in the CPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_prompt_length = 128 + else: + max_prompt_length = args.max_prompt_length + + if not max_prompt_length < max_length: + raise ValueError( + f"max_prompt_length ({max_prompt_length}) should be strictly less than max_length ({max_length})." + ) + + if args.max_completion_length is None and self.is_encoder_decoder: + warnings.warn( + "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_completion_length = 128 + else: + max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments" + " we have set it for you, but you should do it yourself in the future.", + UserWarning, + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(model) + + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.max_completion_length = max_completion_length + self.processing_class = processing_class + + if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0: + warnings.warn( + f"You are using the {args.loss_type} loss type that does not support label smoothing. The " + "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.", + UserWarning, + ) + if args.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in CPOTrainer. 
Please use KTOTrainer.") + + self.beta = args.beta + self.label_smoothing = args.label_smoothing + self.loss_type = args.loss_type + self.cpo_alpha = args.cpo_alpha + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + warnings.warn( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + UserWarning, + ) + + if args.loss_type == "simpo": + self.simpo_gamma = args.simpo_gamma + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in CPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. + # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed, and apply the chat template if needed + train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + + # tokenize the dataset + train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. 
+ self.model_accepts_loss_kwargs = False
+
+ # Add tags for models that have been loaded with the correct transformers version
+ if hasattr(self.model, "add_model_tags"):
+ self.model.add_model_tags(self._tag_names)
+
+ if not hasattr(self, "accelerator"):
+ raise AttributeError(
+ "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`."
+ )
+
+ def build_tokenized_answer(self, prompt, answer):
+ """
+ Llama tokenizer does not satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a +
+ b)[len(enc(a)):]`. Reference:
+ https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257
+ """
+
+ full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False)
+ prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"]
+
+ answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :]
+ answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :]
+
+ # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]`
+ full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids])
+
+ # Prepare input tokens for token-by-token comparison
+ full_input_ids = np.array(full_tokenized["input_ids"])
+
+ if len(full_input_ids) != len(full_concat_input_ids):
+ raise ValueError("Prompt input ids and answer input ids should have the same length.")
+
+ # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens
+ # can be merged together when tokenizing prompt+answer. This could result
+ # in the last token from the prompt being different when tokenized on its own
+ # vs when done as prompt+answer.
+ response_token_ids_start_idx = len(prompt_input_ids)
+
+ # If the tokenized prompt differs from the prefix of the tokenized prompt+answer, it means the
+ # last token has changed due to merging.
+ if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]:
+ response_token_ids_start_idx -= 1
+
+ prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx]
+ prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx]
+
+ if len(prompt_input_ids) != len(prompt_attention_mask):
+ raise ValueError("Prompt input ids and attention mask should have the same length.")
+
+ answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:]
+ answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:]
+
+ return dict(
+ prompt_input_ids=prompt_input_ids,
+ prompt_attention_mask=prompt_attention_mask,
+ input_ids=answer_input_ids,
+ attention_mask=answer_attention_mask,
+ )
+
+ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict:
+ """Tokenize a single row from a CPO specific dataset.
+
+ At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt +
+ chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long,
+ we truncate the chosen/rejected.
+
+ We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length
+ of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens.
+ """
+ batch = {}
+ prompt = feature["prompt"]
+ chosen = feature["chosen"]
+ rejected = feature["rejected"]
+
+ if not self.is_encoder_decoder:
+ # Check issues below for more details
+ # 1. https://github.com/huggingface/trl/issues/907
+ # 2. 
https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257 + # 3. https://github.com/LianjiaTech/BELLE/issues/337 + + if not isinstance(prompt, str): + raise ValueError(f"prompt should be an str but got {type(prompt)}") + prompt_tokens = self.processing_class(prompt, add_special_tokens=False) + prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()} + + if not isinstance(chosen, str): + raise ValueError(f"chosen should be an str but got {type(chosen)}") + chosen_tokens = self.build_tokenized_answer(prompt, chosen) + + if not isinstance(rejected, str): + raise ValueError(f"rejected should be an str but got {type(rejected)}") + rejected_tokens = self.build_tokenized_answer(prompt, rejected) + + # Last prompt token might get merged by tokenizer and + # it should not be included for generation if that happens + prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"]) + + chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"]) + rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"]) + prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids) + + for k, v in prompt_tokens.items(): + prompt_tokens[k] = v[:prompt_len_input_ids] + + # Make sure prompts only have one different token at most an + # and length only differs by 1 at most + num_diff_tokens = sum( + [a != b for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"])] + ) + num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids) + if num_diff_tokens > 1 or num_diff_len > 1: + raise ValueError( + "Chosen and rejected prompt_input_ids might only differ on the " + "last token due to tokenizer merge ops." + ) + + # add BOS token to head of prompt. Avoid adding if it's already there + prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed( + self.processing_class.bos_token_id, + prompt_len_input_ids, + prompt_tokens, + chosen_prompt_len_input_ids, + chosen_tokens, + rejected_prompt_len_input_ids, + rejected_tokens, + ) + + # add EOS token to end of answer. 
Avoid adding if it's already there + chosen_tokens, rejected_tokens = add_eos_token_if_needed( + self.processing_class.eos_token_id, chosen_tokens, rejected_tokens + ) + + longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])) + + # if combined sequence is too long, truncate the prompt + for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + if self.truncation_mode == "keep_start": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_prompt_length] + elif self.truncation_mode == "keep_end": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :] + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + # if that's still too long, truncate the response + for answer_tokens in [chosen_tokens, rejected_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + for k in ["input_ids", "attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length] + + # Create labels + chosen_sequence_tokens = { + k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"] + } + rejected_sequence_tokens = { + k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"] + } + chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:] + chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(chosen_tokens["prompt_input_ids"]) + rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:] + rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(rejected_tokens["prompt_input_ids"]) + + for k, toks in { + "chosen_": chosen_sequence_tokens, + "rejected_": rejected_sequence_tokens, + "": prompt_tokens, + }.items(): + for type_key, tokens in toks.items(): + if type_key == "token_type_ids": + continue + batch[f"{k}{type_key}"] = tokens + + else: + chosen_tokens = self.processing_class( + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + rejected_tokens = self.processing_class( + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + prompt_tokens = self.processing_class( + prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True + ) + + batch["chosen_labels"] = chosen_tokens["input_ids"] + batch["rejected_labels"] = rejected_tokens["input_ids"] + batch["prompt_input_ids"] = prompt_tokens["input_ids"] + batch["prompt_attention_mask"] = prompt_tokens["attention_mask"] + + if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): + batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["rejected_labels"]) + ) + batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["chosen_labels"]) + ) + + return batch + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], + is_encoder_decoder: bool = False, + label_pad_token_id: int = -100, + padding_value: int = 0, + device: Optional[torch.device] = None, + ) -> dict[str, torch.LongTensor]: + """Concatenate the chosen and 
rejected inputs into a single tensor. + + Args: + batch: + A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors + of shape (batch_size, sequence_length). + is_encoder_decoder: + Whether the model is an encoder-decoder model. + label_pad_token_id: + The label pad token id. + padding_value: + The padding value to use for the concatenated inputs_ids. + device: + The device for the concatenated inputs. + + Returns: + A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'. + """ + concatenated_batch = {} + + if is_encoder_decoder: + max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]) + else: + max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + + for k in batch: + if k.startswith("chosen") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("chosen", "concatenated") + concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value) + for k in batch: + if k.startswith("rejected") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("rejected", "concatenated") + concatenated_batch[concatenated_key] = torch.cat( + ( + concatenated_batch[concatenated_key], + pad_to_length(batch[k], max_length, pad_value=pad_value), + ), + dim=0, + ).to(device=device) + + if is_encoder_decoder: + concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device) + concatenated_batch["concatenated_attention_mask"] = ( + batch["prompt_attention_mask"].repeat(2, 1).to(device=device) + ) + + return concatenated_batch + + def cpo_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the CPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the CPO + loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for + the chosen and rejected responses, respectively. + """ + logits = (policy_chosen_logps - policy_rejected_logps).to(self.accelerator.device) + + # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and + # calculates a conservative CPO loss. + + if self.loss_type == "simpo": + gamma_logratios = self.simpo_gamma / self.beta + logits = logits - gamma_logratios + # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0. 
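+ # Worked form of the expression below (our notation, not from the source): with
+ # logits = logp(chosen) - logp(rejected) - gamma/beta and eps = label_smoothing,
+ #     loss = -(1 - eps) * log(sigmoid(beta * logits)) - eps * log(sigmoid(-beta * logits))
+ # i.e. the standard sigmoid preference loss applied to the gamma-shifted margin.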
+ losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + elif self.loss_type == "sigmoid": + # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + elif self.loss_type == "hinge": + losses = torch.relu(1 - self.beta * logits) + elif self.loss_type == "ipo": + # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. + losses = (logits - 1 / (2 * self.beta)) ** 2 + else: + raise ValueError( + f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'simpo']" + ) + + chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach() + rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach() + + return losses, chosen_rewards, rejected_rewards + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: The label pad token id. + is_encoder_decoder: Whether the model is an encoder-decoder model. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. + """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. 
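+
+ For example, a batch with 4 chosen and 4 rejected sequences is run as a single forward pass over 8
+ concatenated rows; the first half of the returned logps/logits corresponds to the chosen responses and the
+ second half to the rejected ones.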
+ """ + concatenated_batch = self.concatenated_inputs( + batch, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + padding_value=self.padding_value, + device=self.accelerator.device, + ) + len_chosen = batch["chosen_labels"].shape[0] + + model_kwargs = ( + { + "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]), + } + if self.is_encoder_decoder + else {} + ) + + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + all_logits = outputs.logits + + def cross_entropy_loss(logits, labels): + if not self.is_encoder_decoder: + # Shift so that tokens < n predict n + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + logits = logits.view(-1, logits.shape[-1]) + labels = labels.view(-1) + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits, labels) + return loss + + labels = concatenated_batch["concatenated_labels"].clone() + + if self.cpo_alpha == 0: + nll_loss = torch.tensor(0.0).to(self.accelerator.device) + else: + nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen]) + + all_logps = self.get_batch_logps( + all_logits, + concatenated_batch["concatenated_labels"], + average_log_prob=self.loss_type in ["ipo", "simpo"], + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + chosen_logps = all_logps[:len_chosen] + rejected_logps = all_logps[len_chosen:] + + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss, outputs.aux_loss) + + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, nll_loss) + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the CPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + forward_output = self.concatenated_forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_nll_loss, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + losses, chosen_rewards, rejected_rewards = self.cpo_loss( + policy_chosen_logps, + policy_rejected_logps, + ) + + loss = losses.mean() + self.cpo_alpha * policy_nll_loss + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item() + metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item() + metrics[f"{prefix}rewards/margins"] = ( + self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item() + ) + metrics[f"{prefix}logps/rejected"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean().item() + ) + metrics[f"{prefix}logps/chosen"] = ( + 
self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean().item() + ) + metrics[f"{prefix}logits/rejected"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean()).mean().item() + ) + metrics[f"{prefix}logits/chosen"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean()).mean().item() + ) + metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item() + + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") + + # force log the metrics + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + return policy_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") + + # force log the metrics + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = { + "eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in 
metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded = self.generate_from_model(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy"], + data=[ + [prompt, pol[len(prompt) :]] for prompt, pol in zip(random_batch["prompt"], policy_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. + """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + def _shift_right(self, input_ids): + if self.decoder_start_token_id is None: + raise ValueError( + "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. 
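+ # Illustrative example (hypothetical ids): labels [a, b, c] are shifted to
+ # [decoder_start_token_id, a, b]; any -100 entries are then replaced by pad_token_id further below.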
+ shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = self.decoder_start_token_id + + if self.pad_token_id is None: + raise ValueError("model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id) + + return shifted_input_ids + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @inproceedings{xu2024contrastive, + title = {{Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation}}, + author = {Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim}, + year = 2024, + booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=51iwkioZpn} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="CPO", + trainer_citation=citation, + paper_title="Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation", + paper_id="2401.08417", + ) + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothCPOTrainer(_UnslothCPOTrainer): + """ + + Initialize CPOTrainer. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`CPOConfig`): + The CPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. 
If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + **kwargs + ): + if args is None: args = UnslothCPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('cpo_trainer', other_metrics) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothDDPOTrainer.py b/unsloth_compiled_cache/UnslothDDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..157fdb1b96e16168590329ed165323f2a1539d26 --- /dev/null +++ b/unsloth_compiled_cache/UnslothDDPOTrainer.py @@ -0,0 +1,889 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.ddpo_trainer import (Accelerator, Any, Callable, DDPOConfig, DDPOStableDiffusionPipeline, DDPOTrainer, Optional, Path, PerPromptStatTracker, ProjectConfiguration, PyTorchModelHubMixin, Union, defaultdict, futures, generate_model_card, get_comet_experiment_url, is_wandb_available, logger, os, set_seed, textwrap, torch, wandb, warn) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) 
for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothDDPOConfig(DDPOConfig): + """ + + Configuration class for the [`DDPOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (by default is the file name without the extension name). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + seed (`int`, *optional*, defaults to `0`): + Random seed. + log_with (`Literal["wandb", "tensorboard"]]` or `None`, *optional*, defaults to `None`): + Log with either 'wandb' or 'tensorboard', check + https://huggingface.co/docs/accelerate/usage_guides/tracking for more details. + tracker_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g. wandb_project). + accelerator_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g. `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + sample_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for sampling. + sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`): + Number of batches to sample per epoch. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for training. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Use 8bit Adam optimizer from bitsandbytes. + train_learning_rate (`float`, *optional*, defaults to `3e-4`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Adam beta1. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Adam beta2. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Adam weight decay. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Adam epsilon. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. 
+ train_num_inner_epochs (`int`, *optional*, defaults to `1`): + Number of inner epochs per outer epoch. + train_cfg (`bool`, *optional*, defaults to `True`): + Whether to use classifier-free guidance during training. + train_adv_clip_max (`float`, *optional*, defaults to `5.0`): + Clip advantages to the range. + train_clip_range (`float`, *optional*, defaults to `1e-4`): + PPO clip range. + train_timestep_fraction (`float`, *optional*, defaults to `1.0`): + Fraction of timesteps to train on. + per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`): + Whether to track statistics for each prompt separately. + per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`): + Number of reward values to store in the buffer for each prompt. + per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`): + Minimum number of reward values to store in the buffer. + async_reward_computation (`bool`, *optional*, defaults to `False`): + Whether to compute rewards asynchronously. + max_workers (`int`, *optional*, defaults to `2`): + Maximum number of workers to use for async reward computation. + negative_prompts (`str`, *optional*, defaults to `""`): + Comma-separated list of prompts to use as negative examples. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether to push the final model checkpoint to the Hub. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + exp_name = 'colab_kernel_launcher', + run_name = '', + seed = 3407, + log_with = None, + tracker_project_name = 'trl', + logdir = 'logs', + num_epochs = 100, + save_freq = 1, + num_checkpoint_limit = 5, + mixed_precision = 'fp16', + allow_tf32 = True, + resume_from = '', + sample_num_steps = 50, + sample_eta = 1.0, + sample_guidance_scale = 5.0, + sample_batch_size = 1, + sample_num_batches_per_epoch = 2, + train_batch_size = 1, + train_use_8bit_adam = False, + train_learning_rate = 5e-05, + train_adam_beta1 = 0.9, + train_adam_beta2 = 0.999, + train_adam_weight_decay = 0.01, + train_adam_epsilon = 1e-08, + train_gradient_accumulation_steps = 2, + train_max_grad_norm = 1.0, + train_num_inner_epochs = 1, + train_cfg = True, + train_adv_clip_max = 5.0, + train_clip_range = 0.0001, + train_timestep_fraction = 1.0, + per_prompt_stat_tracking = False, + per_prompt_stat_tracking_buffer_size = 16, + per_prompt_stat_tracking_min_count = 16, + async_reward_computation = False, + max_workers = 2, + negative_prompts = '', + push_to_hub = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + + super().__init__( + exp_name = exp_name, + run_name = run_name, + seed = seed, + log_with = log_with, + tracker_project_name = tracker_project_name, + logdir = logdir, + num_epochs = num_epochs, + save_freq = save_freq, + num_checkpoint_limit = num_checkpoint_limit, + mixed_precision = mixed_precision, + allow_tf32 = allow_tf32, + resume_from = resume_from, + sample_num_steps = sample_num_steps, + sample_eta = sample_eta, + sample_guidance_scale = sample_guidance_scale, + sample_batch_size = sample_batch_size, + sample_num_batches_per_epoch = sample_num_batches_per_epoch, + train_batch_size = train_batch_size, + train_use_8bit_adam = train_use_8bit_adam, + train_learning_rate = train_learning_rate, + train_adam_beta1 = train_adam_beta1, + 
train_adam_beta2 = train_adam_beta2, + train_adam_weight_decay = train_adam_weight_decay, + train_adam_epsilon = train_adam_epsilon, + train_gradient_accumulation_steps = train_gradient_accumulation_steps, + train_max_grad_norm = train_max_grad_norm, + train_num_inner_epochs = train_num_inner_epochs, + train_cfg = train_cfg, + train_adv_clip_max = train_adv_clip_max, + train_clip_range = train_clip_range, + train_timestep_fraction = train_timestep_fraction, + per_prompt_stat_tracking = per_prompt_stat_tracking, + per_prompt_stat_tracking_buffer_size = per_prompt_stat_tracking_buffer_size, + per_prompt_stat_tracking_min_count = per_prompt_stat_tracking_min_count, + async_reward_computation = async_reward_computation, + max_workers = max_workers, + negative_prompts = negative_prompts, + push_to_hub = push_to_hub,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothDDPOTrainer(PyTorchModelHubMixin): + """""" + + _tag_names = ["trl", "ddpo"] + + def __init__( + self, + config: DDPOConfig, + reward_function: Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor], + prompt_function: Callable[[], tuple[str, Any]], + sd_pipeline: DDPOStableDiffusionPipeline, + image_samples_hook: Optional[Callable[[Any, Any, Any], Any]] = None, + ): + if image_samples_hook is None: + warn("No image_samples_hook provided; no images will be logged") + + self.prompt_fn = prompt_function + self.reward_fn = reward_function + self.config = config + self.image_samples_callback = image_samples_hook + + accelerator_project_config = ProjectConfiguration(**self.config.project_kwargs) + + if self.config.resume_from: + self.config.resume_from = os.path.normpath(os.path.expanduser(self.config.resume_from)) + if "checkpoint_" not in os.path.basename(self.config.resume_from): + # get the most recent checkpoint in this directory + checkpoints = list( + filter( + lambda x: "checkpoint_" in x, + os.listdir(self.config.resume_from), + ) + ) + if len(checkpoints) == 0: + raise ValueError(f"No checkpoints found in {self.config.resume_from}") + checkpoint_numbers = sorted([int(x.split("_")[-1]) for x in checkpoints]) + self.config.resume_from = os.path.join( + self.config.resume_from, + f"checkpoint_{checkpoint_numbers[-1]}", + ) + + accelerator_project_config.iteration = checkpoint_numbers[-1] + 1 + + # number of timesteps within each trajectory to train on + self.num_train_timesteps = int(self.config.sample_num_steps * self.config.train_timestep_fraction) + + self.accelerator = Accelerator( + log_with=self.config.log_with, + mixed_precision=self.config.mixed_precision, + project_config=accelerator_project_config, + # we always accumulate gradients across timesteps; we want config.train.gradient_accumulation_steps to be the + # number of *samples* we accumulate across, so we need to multiply by the number of training timesteps to get + # the total number of optimizer steps to accumulate across. 
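+ # Worked example (hypothetical numbers): with train_gradient_accumulation_steps=2,
+ # sample_num_steps=50 and train_timestep_fraction=1.0, num_train_timesteps is 50, so
+ # Accelerate accumulates over 2 * 50 = 100 backward passes per optimizer step.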
+ gradient_accumulation_steps=self.config.train_gradient_accumulation_steps * self.num_train_timesteps, + **self.config.accelerator_kwargs, + ) + + is_okay, message = self._config_check() + if not is_okay: + raise ValueError(message) + + is_using_tensorboard = config.log_with is not None and config.log_with == "tensorboard" + + if self.accelerator.is_main_process: + self.accelerator.init_trackers( + self.config.tracker_project_name, + config=dict(ddpo_trainer_config=config.to_dict()) if not is_using_tensorboard else config.to_dict(), + init_kwargs=self.config.tracker_kwargs, + ) + + logger.info(f"\n{config}") + + set_seed(self.config.seed, device_specific=True) + + self.sd_pipeline = sd_pipeline + + self.sd_pipeline.set_progress_bar_config( + position=1, + disable=not self.accelerator.is_local_main_process, + leave=False, + desc="Timestep", + dynamic_ncols=True, + ) + + # For mixed precision training we cast all non-trainable weights [vae, non-lora text_encoder and non-lora unet] to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + if self.accelerator.mixed_precision == "fp16": + inference_dtype = torch.float16 + elif self.accelerator.mixed_precision == "bf16": + inference_dtype = torch.bfloat16 + else: + inference_dtype = torch.float32 + + self.sd_pipeline.vae.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.text_encoder.to(self.accelerator.device, dtype=inference_dtype) + self.sd_pipeline.unet.to(self.accelerator.device, dtype=inference_dtype) + + trainable_layers = self.sd_pipeline.get_trainable_layers() + + self.accelerator.register_save_state_pre_hook(self._save_model_hook) + self.accelerator.register_load_state_pre_hook(self._load_model_hook) + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if self.config.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + self.optimizer = self._setup_optimizer( + trainable_layers.parameters() if not isinstance(trainable_layers, list) else trainable_layers + ) + + self.neg_prompt_embed = self.sd_pipeline.text_encoder( + self.sd_pipeline.tokenizer( + [""] if self.config.negative_prompts is None else self.config.negative_prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + )[0] + + if config.per_prompt_stat_tracking: + self.stat_tracker = PerPromptStatTracker( + config.per_prompt_stat_tracking_buffer_size, + config.per_prompt_stat_tracking_min_count, + ) + + # NOTE: for some reason, autocast is necessary for non-lora training but for lora training it isn't necessary and it uses + # more memory + self.autocast = self.sd_pipeline.autocast or self.accelerator.autocast + + if hasattr(self.sd_pipeline, "use_lora") and self.sd_pipeline.use_lora: + unet, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + self.trainable_layers = list(filter(lambda p: p.requires_grad, unet.parameters())) + else: + self.trainable_layers, self.optimizer = self.accelerator.prepare(trainable_layers, self.optimizer) + + if self.config.async_reward_computation: + self.executor = futures.ThreadPoolExecutor(max_workers=config.max_workers) + + if config.resume_from: + logger.info(f"Resuming from {config.resume_from}") + self.accelerator.load_state(config.resume_from) + self.first_epoch = int(config.resume_from.split("_")[-1]) + 1 + else: + 
self.first_epoch = 0 + + def compute_rewards(self, prompt_image_pairs, is_async=False): + if not is_async: + rewards = [] + for images, prompts, prompt_metadata in prompt_image_pairs: + reward, reward_metadata = self.reward_fn(images, prompts, prompt_metadata) + rewards.append( + ( + torch.as_tensor(reward, device=self.accelerator.device), + reward_metadata, + ) + ) + else: + rewards = self.executor.map(lambda x: self.reward_fn(*x), prompt_image_pairs) + rewards = [ + (torch.as_tensor(reward.result(), device=self.accelerator.device), reward_metadata.result()) + for reward, reward_metadata in rewards + ] + + return zip(*rewards) + + def step(self, epoch: int, global_step: int): + """ + Perform a single step of training. + + Args: + epoch (int): The current epoch. + global_step (int): The current global step. + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. + - If `self.image_samples_callback` is not None, it will be called with the prompt_image_pairs, global_step, + and the accelerator tracker. + + Returns: + global_step (int): The updated global step. + + """ + samples, prompt_image_data = self._generate_samples( + iterations=self.config.sample_num_batches_per_epoch, + batch_size=self.config.sample_batch_size, + ) + + # collate samples into dict where each entry has shape (num_batches_per_epoch * sample.batch_size, ...) + samples = {k: torch.cat([s[k] for s in samples]) for k in samples[0].keys()} + rewards, rewards_metadata = self.compute_rewards( + prompt_image_data, is_async=self.config.async_reward_computation + ) + + for i, image_data in enumerate(prompt_image_data): + image_data.extend([rewards[i], rewards_metadata[i]]) + + if self.image_samples_callback is not None: + self.image_samples_callback(prompt_image_data, global_step, self.accelerator.trackers[0]) + + rewards = torch.cat(rewards) + rewards = self.accelerator.gather(rewards).cpu().numpy() + + self.accelerator.log( + { + "reward": rewards, + "epoch": epoch, + "reward_mean": rewards.mean(), + "reward_std": rewards.std(), + }, + step=global_step, + ) + + if self.config.per_prompt_stat_tracking: + # gather the prompts across processes + prompt_ids = self.accelerator.gather(samples["prompt_ids"]).cpu().numpy() + prompts = self.sd_pipeline.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True) + advantages = self.stat_tracker.update(prompts, rewards) + else: + advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8) + + # ungather advantages; keep the entries corresponding to the samples on this process + samples["advantages"] = ( + torch.as_tensor(advantages) + .reshape(self.accelerator.num_processes, -1)[self.accelerator.process_index] + .to(self.accelerator.device) + ) + + del samples["prompt_ids"] + + total_batch_size, num_timesteps = samples["timesteps"].shape + + for inner_epoch in range(self.config.train_num_inner_epochs): + # shuffle samples along batch dimension + perm = torch.randperm(total_batch_size, device=self.accelerator.device) + samples = {k: v[perm] for k, v in samples.items()} + + # shuffle along time dimension independently for each sample + # still trying to understand the code below + perms = torch.stack( + [torch.randperm(num_timesteps, device=self.accelerator.device) for _ in range(total_batch_size)] + ) + + for key in ["timesteps", "latents", "next_latents", "log_probs"]: + samples[key] = samples[key][ + torch.arange(total_batch_size, device=self.accelerator.device)[:, None], + perms, + ] + + original_keys = samples.keys() + original_values 
= samples.values() + # rebatch them as user defined train_batch_size is different from sample_batch_size + reshaped_values = [v.reshape(-1, self.config.train_batch_size, *v.shape[1:]) for v in original_values] + + # Transpose the list of original values + transposed_values = zip(*reshaped_values) + # Create new dictionaries for each row of transposed values + samples_batched = [dict(zip(original_keys, row_values)) for row_values in transposed_values] + + self.sd_pipeline.unet.train() + global_step = self._train_batched_samples(inner_epoch, epoch, global_step, samples_batched) + # ensure optimization step at the end of the inner epoch + if not self.accelerator.sync_gradients: + raise ValueError( + "Optimization step should have been performed by this point. Please check calculated gradient accumulation settings." + ) + + if epoch != 0 and epoch % self.config.save_freq == 0 and self.accelerator.is_main_process: + self.accelerator.save_state() + + return global_step + + def calculate_loss(self, latents, timesteps, next_latents, log_probs, advantages, embeds): + """ + Calculate the loss for a batch of an unpacked sample + + Args: + latents (torch.Tensor): + The latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height, width] + timesteps (torch.Tensor): + The timesteps sampled from the diffusion model, shape: [batch_size] + next_latents (torch.Tensor): + The next latents sampled from the diffusion model, shape: [batch_size, num_channels_latents, height, + width] + log_probs (torch.Tensor): + The log probabilities of the latents, shape: [batch_size] + advantages (torch.Tensor): + The advantages of the latents, shape: [batch_size] + embeds (torch.Tensor): + The embeddings of the prompts, shape: [2*batch_size or batch_size, ...] 
Note: the "or" is because if + train_cfg is True, the expectation is that negative prompts are concatenated to the embeds + + Returns: + loss (torch.Tensor), approx_kl (torch.Tensor), clipfrac (torch.Tensor) (all of these are of shape (1,)) + """ + with self.autocast(): + if self.config.train_cfg: + noise_pred = self.sd_pipeline.unet( + torch.cat([latents] * 2), + torch.cat([timesteps] * 2), + embeds, + ).sample + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.config.sample_guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + else: + noise_pred = self.sd_pipeline.unet( + latents, + timesteps, + embeds, + ).sample + # compute the log prob of next_latents given latents under the current model + + scheduler_step_output = self.sd_pipeline.scheduler_step( + noise_pred, + timesteps, + latents, + eta=self.config.sample_eta, + prev_sample=next_latents, + ) + + log_prob = scheduler_step_output.log_probs + + advantages = torch.clamp( + advantages, + -self.config.train_adv_clip_max, + self.config.train_adv_clip_max, + ) + + ratio = torch.exp(log_prob - log_probs) + + loss = self.loss(advantages, self.config.train_clip_range, ratio) + + approx_kl = 0.5 * torch.mean((log_prob - log_probs) ** 2) + + clipfrac = torch.mean((torch.abs(ratio - 1.0) > self.config.train_clip_range).float()) + + return loss, approx_kl, clipfrac + + def loss( + self, + advantages: torch.Tensor, + clip_range: float, + ratio: torch.Tensor, + ): + unclipped_loss = -advantages * ratio + clipped_loss = -advantages * torch.clamp( + ratio, + 1.0 - clip_range, + 1.0 + clip_range, + ) + return torch.mean(torch.maximum(unclipped_loss, clipped_loss)) + + def _setup_optimizer(self, trainable_layers_parameters): + if self.config.train_use_8bit_adam: + import bitsandbytes + + optimizer_cls = bitsandbytes.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + return optimizer_cls( + trainable_layers_parameters, + lr=self.config.train_learning_rate, + betas=(self.config.train_adam_beta1, self.config.train_adam_beta2), + weight_decay=self.config.train_adam_weight_decay, + eps=self.config.train_adam_epsilon, + ) + + def _save_model_hook(self, models, weights, output_dir): + self.sd_pipeline.save_checkpoint(models, weights, output_dir) + weights.pop() # ensures that accelerate doesn't try to handle saving of the model + + def _load_model_hook(self, models, input_dir): + self.sd_pipeline.load_checkpoint(models, input_dir) + models.pop() # ensures that accelerate doesn't try to handle loading of the model + + def _generate_samples(self, iterations, batch_size): + """ + Generate samples from the model + + Args: + iterations (int): Number of iterations to generate samples for + batch_size (int): Batch size to use for sampling + + Returns: + samples (list[dict[str, torch.Tensor]]), prompt_image_pairs (list[list[Any]]) + """ + samples = [] + prompt_image_pairs = [] + self.sd_pipeline.unet.eval() + + sample_neg_prompt_embeds = self.neg_prompt_embed.repeat(batch_size, 1, 1) + + for _ in range(iterations): + prompts, prompt_metadata = zip(*[self.prompt_fn() for _ in range(batch_size)]) + + prompt_ids = self.sd_pipeline.tokenizer( + prompts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.sd_pipeline.tokenizer.model_max_length, + ).input_ids.to(self.accelerator.device) + prompt_embeds = self.sd_pipeline.text_encoder(prompt_ids)[0] + + with self.autocast(): + sd_output = self.sd_pipeline( + prompt_embeds=prompt_embeds, + 
negative_prompt_embeds=sample_neg_prompt_embeds, + num_inference_steps=self.config.sample_num_steps, + guidance_scale=self.config.sample_guidance_scale, + eta=self.config.sample_eta, + output_type="pt", + ) + + images = sd_output.images + latents = sd_output.latents + log_probs = sd_output.log_probs + + latents = torch.stack(latents, dim=1) # (batch_size, num_steps + 1, ...) + log_probs = torch.stack(log_probs, dim=1) # (batch_size, num_steps, 1) + timesteps = self.sd_pipeline.scheduler.timesteps.repeat(batch_size, 1) # (batch_size, num_steps) + + samples.append( + { + "prompt_ids": prompt_ids, + "prompt_embeds": prompt_embeds, + "timesteps": timesteps, + "latents": latents[:, :-1], # each entry is the latent before timestep t + "next_latents": latents[:, 1:], # each entry is the latent after timestep t + "log_probs": log_probs, + "negative_prompt_embeds": sample_neg_prompt_embeds, + } + ) + prompt_image_pairs.append([images, prompts, prompt_metadata]) + + return samples, prompt_image_pairs + + def _train_batched_samples(self, inner_epoch, epoch, global_step, batched_samples): + """ + Train on a batch of samples. Main training segment + + Args: + inner_epoch (int): The current inner epoch + epoch (int): The current epoch + global_step (int): The current global step + batched_samples (list[dict[str, torch.Tensor]]): The batched samples to train on + + Side Effects: + - Model weights are updated + - Logs the statistics to the accelerator trackers. + + Returns: + global_step (int): The updated global step + """ + info = defaultdict(list) + for _i, sample in enumerate(batched_samples): + if self.config.train_cfg: + # concat negative prompts to sample prompts to avoid two forward passes + embeds = torch.cat([sample["negative_prompt_embeds"], sample["prompt_embeds"]]) + else: + embeds = sample["prompt_embeds"] + + for j in range(self.num_train_timesteps): + with self.accelerator.accumulate(self.sd_pipeline.unet): + loss, approx_kl, clipfrac = self.calculate_loss( + sample["latents"][:, j], + sample["timesteps"][:, j], + sample["next_latents"][:, j], + sample["log_probs"][:, j], + sample["advantages"], + embeds, + ) + info["approx_kl"].append(approx_kl) + info["clipfrac"].append(clipfrac) + info["loss"].append(loss) + + self.accelerator.backward(loss) + if self.accelerator.sync_gradients: + self.accelerator.clip_grad_norm_( + self.trainable_layers.parameters() + if not isinstance(self.trainable_layers, list) + else self.trainable_layers, + self.config.train_max_grad_norm, + ) + self.optimizer.step() + self.optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if self.accelerator.sync_gradients: + # log training-related stuff + info = {k: torch.mean(torch.stack(v)) for k, v in info.items()} + info = self.accelerator.reduce(info, reduction="mean") + info.update({"epoch": epoch, "inner_epoch": inner_epoch}) + self.accelerator.log(info, step=global_step) + global_step += 1 + info = defaultdict(list) + return global_step + + def _config_check(self) -> tuple[bool, str]: + samples_per_epoch = ( + self.config.sample_batch_size * self.accelerator.num_processes * self.config.sample_num_batches_per_epoch + ) + total_train_batch_size = ( + self.config.train_batch_size + * self.accelerator.num_processes + * self.config.train_gradient_accumulation_steps + ) + + if not self.config.sample_batch_size >= self.config.train_batch_size: + return ( + False, + f"Sample batch size ({self.config.sample_batch_size}) must be greater than or equal to the train batch 
size ({self.config.train_batch_size})", + ) + if not self.config.sample_batch_size % self.config.train_batch_size == 0: + return ( + False, + f"Sample batch size ({self.config.sample_batch_size}) must be divisible by the train batch size ({self.config.train_batch_size})", + ) + if not samples_per_epoch % total_train_batch_size == 0: + return ( + False, + f"Number of samples per epoch ({samples_per_epoch}) must be divisible by the total train batch size ({total_train_batch_size})", + ) + return True, "" + + def train(self, epochs: Optional[int] = None): + """ + Train the model for a given number of epochs + """ + global_step = 0 + if epochs is None: + epochs = self.config.num_epochs + for epoch in range(self.first_epoch, epochs): + global_step = self.step(epoch, global_step) + + def _save_pretrained(self, save_directory): + self.sd_pipeline.save_pretrained(save_directory) + self.create_model_card() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @inproceedings{black2024training, + title = {{Training Diffusion Models with Reinforcement Learning}}, + author = {Kevin Black and Michael Janner and Yilun Du and Ilya Kostrikov and Sergey Levine}, + year = 2024, + booktitle = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=YCWjhGrJFD}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="DDPO", + trainer_citation=citation, + paper_title="Training Diffusion Models with Reinforcement Learning", + paper_id="2305.13301", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothDDPOTrainer(_UnslothDDPOTrainer): + """ + + The DDPOTrainer uses Deep Diffusion Policy Optimization to optimise diffusion models. 
Note, this trainer is heavily + inspired by the work here: https://github.com/kvablack/ddpo-pytorch. As of now, only Stable Diffusion based pipelines + are supported. + + Attributes: + **config** (`DDPOConfig`) -- Configuration object for DDPOTrainer. Check the documentation of `DDPOConfig` for more + details. + **reward_function** (Callable[[torch.Tensor, tuple[str], tuple[Any]], torch.Tensor]) -- Reward function to be used. + **prompt_function** (Callable[[], tuple[str, Any]]) -- Function to generate prompts to guide the model. + **sd_pipeline** (`DDPOStableDiffusionPipeline`) -- Stable Diffusion pipeline to be used for training. + **image_samples_hook** (Optional[Callable[[Any, Any, Any], Any]]) -- Hook to be called to log images. + + """ + def __init__( + self, + config, + reward_function, + prompt_function, + sd_pipeline, + image_samples_hook = None, + **kwargs + ): + if config is None: config = UnslothDDPOConfig() + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('ddpo_trainer', other_metrics) + + super().__init__( + config = config, + reward_function = reward_function, + prompt_function = prompt_function, + sd_pipeline = sd_pipeline, + image_samples_hook = image_samples_hook,**kwargs) + +pass diff --git a/unsloth_compiled_cache/UnslothDPOTrainer.py b/unsloth_compiled_cache/UnslothDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..76cb98c64ef062d7910461668200bf1db871cc83 --- /dev/null +++ b/unsloth_compiled_cache/UnslothDPOTrainer.py @@ -0,0 +1,2443 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.dpo_trainer import (Any, AutoModelForCausalLM, AutoTokenizer, BaseImageProcessor, Callable, DPOConfig, DPOTrainer, DataCollator, DataCollatorForPreference, DataLoader, Dataset, EvalLoopOutput, F, FDivergenceConstants, FDivergenceType, FeatureExtractionMixin, IterableDataset, Literal, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, Optional, PartialState, Path, PeftConfig, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RunningMoments, SyncRefModelCallback, Trainer, TrainerCallback, Union, autocast, cap_exp, contextmanager, create_reference_model, dataclass, defaultdict, disable_dropout_in_model, empty_cache, flush_left, flush_right, generate_model_card, get_comet_experiment_url, get_peft_model, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, log_table_to_comet_experiment, maybe_apply_chat_template, maybe_extract_prompt, nn, nullcontext, os, pad, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_fsdp, prepare_model_for_kbit_training, random, shift_tokens_right, textwrap, torch, tqdm, wandb, warnings, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True,
fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothDPOConfig(DPOConfig): + """ + + Configuration class for the [`DPOTrainer`]. + + This class includes only the parameters that are specific to DPO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `model` argument of the + [`DPOTrainer`] is provided as a string. + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for `AutoModelForCausalLM.from_pretrained`, used when the `ref_model` argument of the + [`DPOTrainer`] is provided as a string. + model_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + force_use_ref_model (`bool`, *optional*, defaults to `False`): + If you provide a PEFT model as the active model and wish to use a different model for the `ref_model`, set + this flag to `True`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + use_logits_to_keep (`bool`, *optional*, defaults to `False`): + If `True`, only a specified number of logits are computed in the forward pass. This can be useful for + saving memory and speeding up training by not computing the logits for all tokens, especially in scenarios + when working with very long prompts where labels are ignored (-100). + + > Parameters that control the data preprocessing + + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Padding value to use for labels. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the full sequence (prompt + completion). + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the sequence exceeds `max_length`. 
Possible values are `"keep_end"` and + `"keep_start"`. + padding_free (`bool`, *optional*, defaults to `False`): + Whether to perform forward passes without padding by flattening all sequences in the batch into a single + continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only + supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened + batch structure. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute the log probabilities from the reference model. Setting this to `True` allows + training without needing the reference model during training, which can help reduce GPU memory usage. If + set to `False` (default), the reference model will be used during training to compute log probabilities + on-the-fly. + precompute_ref_batch_size (`int` or `None`, *optional*, defaults to `None`): + Batch size to use when precomputing reference model log probabilities. This can be set higher than the + training batch size to speed up preprocessing. If `None`, defaults to `per_device_train_batch_size` for + training and `per_device_eval_batch_size` for evaluation. + tools (`Optional[list[Union[dict, Callable]]]`, *optional*, defaults to `None`): + List of tools (callable functions) that will be accessible to the model. If the template does not support + function calling, this argument will have no effect. + + > Parameters that control the training + + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the + [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"exo_pair"`: pairwise EXO loss from the [EXO](https://huggingface.co/papers/2402.00856) paper. + - `"nca_pair"`: pairwise NCA loss from the [NCA](https://huggingface.co/papers/2402.05369) paper. + - `"robust"`: unbiased estimate of the DPO loss that is robust to preference noise from the [Robust + DPO](https://huggingface.co/papers/2403.00409) paper. + - `"bco_pair"`: pairwise BCO loss from the [BCO](https://huggingface.co/papers/2404.04656) paper. + - `"sppo_hard"`: SPPO loss with hard label from the [SPPO](https://huggingface.co/papers/2405.00675) + paper. + - `"aot"`: AOT loss for paired datasets from the [AOT](https://huggingface.co/papers/2406.05882) paper. + - `"aot_pair"`: AOT loss for unpaired datasets from the [AOT](https://huggingface.co/papers/2406.05882) + paper. + - `"discopop"`: DiscoPOP (a.k.a Log-Ratio Modulated Loss, LRML) loss from the + [DiscoPOP](https://huggingface.co/papers/2406.08414) paper. + - `"apo_zero"`: APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. + + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use Liger loss. + base_model_attribute_name (`str`, *optional*, defaults to `"model"`): + Name of the attribute in the model that contains the base model. This is used to get the base model from + the model when the model does not have a `get_decoder` method in the case when `use_liger_loss` is `True`. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. 
Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). + f_divergence_type (`str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): + Type of f-divergence regularization function to compute divergence between policy and reference model. + f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`): + α coefficient in the α-divergence u^-α regularization function for DPO loss. + reference_free (`bool`, *optional*, defaults to `False`): + Whether to ignore the provided reference model and implicitly use a reference model that assigns equal + probability to all responses. + label_smoothing (`float`, *optional*, defaults to `0.0`): + Robust DPO label smoothing parameter from the [cDPO report](https://ericmitchell.ai/cdpo.pdf) and [Robust + DPO](https://huggingface.co/papers/2403.00409) paper that should be between `0.0` and `0.5`. + use_weighting (`bool`, *optional*, defaults to `False`): + Whether to weight the loss as done in the [WPO paper](https://huggingface.co/papers/2406.11827). + rpo_alpha (`float`, *optional*, defaults to `None`): + α parameter from the [RPO paper](https://huggingface.co/papers/2404.19733) (v3), which controls the + weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the + DPO loss. The paper recommends `rpo_alpha=1.0`. + ld_alpha (`float` or `None`, *optional*, defaults to `None`): + α parameter from the [LD-DPO paper](https://huggingface.co/papers/2409.06411), which controls the weighting + of the verbose token log-probabilities in responses. If `None`, no weighting is applied to the verbose + part, and the loss is equivalent to the standard DPO loss. The paper recommends setting `ld_alpha` between + `0.0` and `1.0`. + discopop_tau (`float`, *optional*, defaults to `0.05`): + τ/temperature parameter from the [DiscoPOP](https://huggingface.co/papers/2406.08414) paper, which controls + the shape of log ratio modulated loss. The paper recommends the default value `discopop_tau=0.05`. + sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originites from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + + > Parameters that control the logging + + generate_during_eval (`bool`, *optional*, defaults to `False`): + Whether to generate and log completions from both the model and the reference model to W&B or Comet during + evaluation. 
+ + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = 
None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + model_init_kwargs = None, + ref_model_init_kwargs = None, + model_adapter_name = None, + ref_adapter_name = None, + force_use_ref_model = False, + disable_dropout = True, + use_logits_to_keep = False, + dataset_num_proc = None, + padding_value = None, + label_pad_token_id = -100, + max_prompt_length = 512, + max_completion_length = None, + max_length = 1024, + truncation_mode = 'keep_end', + padding_free = False, + precompute_ref_log_probs = False, + precompute_ref_batch_size = None, + tools = None, + loss_type = 'sigmoid', + use_liger_loss = False, + base_model_attribute_name = 'model', + beta = 0.1, + f_alpha_divergence_coef = 1.0, + reference_free = False, + label_smoothing = 0.0, + use_weighting = False, + rpo_alpha = None, + ld_alpha = None, + discopop_tau = 0.05, + sync_ref_model = False, + ref_model_mixup_alpha = 0.6, + ref_model_sync_steps = 512, + generate_during_eval = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed 
= seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + ref_model_init_kwargs = ref_model_init_kwargs, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name, + force_use_ref_model = force_use_ref_model, + disable_dropout = disable_dropout, + use_logits_to_keep = 
use_logits_to_keep, + dataset_num_proc = dataset_num_proc, + padding_value = padding_value, + label_pad_token_id = label_pad_token_id, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + max_length = max_length, + truncation_mode = truncation_mode, + padding_free = padding_free, + precompute_ref_log_probs = precompute_ref_log_probs, + precompute_ref_batch_size = precompute_ref_batch_size, + tools = tools, + loss_type = loss_type, + use_liger_loss = use_liger_loss, + base_model_attribute_name = base_model_attribute_name, + beta = beta, + f_alpha_divergence_coef = f_alpha_divergence_coef, + reference_free = reference_free, + label_smoothing = label_smoothing, + use_weighting = use_weighting, + rpo_alpha = rpo_alpha, + ld_alpha = ld_alpha, + discopop_tau = discopop_tau, + sync_ref_model = sync_ref_model, + ref_model_mixup_alpha = ref_model_mixup_alpha, + ref_model_sync_steps = ref_model_sync_steps, + generate_during_eval = generate_during_eval,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothDPOTrainer(Trainer): + """""" + + _tag_names = ["trl", "dpo"] + + def __init__( + self, + model: Union[str, nn.Module, PreTrainedModel], + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: Optional[DPOConfig] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + ): + # Args + model_id = model if isinstance(model, str) else model.config._name_or_path + if args is None: + model_name = model_id.split("/")[-1] + args = DPOConfig(f"{model_name}-DPO") + + # Handle the tokenizer + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model_id) + + if args.padding_value is not None: + self.padding_value = args.padding_value + else: + if hasattr(processing_class, "pad_token_id") and processing_class.pad_token_id is not None: + self.padding_value = processing_class.pad_token_id + elif hasattr(processing_class, "tokenizer") and processing_class.tokenizer.pad_token_id is not None: + self.padding_value = processing_class.tokenizer.pad_token_id + else: + raise ValueError( + "`padding_value` is not specified in `DPOConfig`, and `pad_token_id` is missing in the " + "`processing_class`. Please either set the `padding_value` argument in `DPOConfig`, or set " + "`tokenizer.pad_token` (e.g., `tokenizer.pad_token = tokenizer.eos_token`) before instantiating " + "the trainer." + ) + + # Model + if not isinstance(model, str) and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must mass a copy of it, or `None` if you use peft." 
+ ) + + if args.model_init_kwargs is not None and not isinstance(model, str): + warnings.warn( + "You passed model_init_kwargs to the `DPOConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) + if isinstance(model, str): + model = self._create_model_from_path(model, args) + + if args.ref_model_init_kwargs is not None and not isinstance(ref_model, str): + warnings.warn( + "You passed ref_model_init_kwargs to the `DPOConfig`, but your ref_model is already instantiated. " + "The `ref_model_init_kwargs` will be ignored." + ) + if isinstance(ref_model, str): + ref_model = self._create_model_from_path(ref_model, args, is_ref=True) + + # PEFT configuration and model wrapping + model = self._prepare_peft_model(model, ref_model, peft_config, args) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." + ) + + self.is_encoder_decoder = model.config.is_encoder_decoder + self.is_vision_model = model.config.model_type in MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.keys() + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = args.model_adapter_name + self.ref_adapter_name = args.ref_adapter_name + self.reference_free = args.reference_free + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Liger kernel + if args.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "You set `use_liger_loss=True` but the liger kernel is not available. " + "Please install liger-kernel first: `pip install liger-kernel`" + ) + if args.loss_type != "sigmoid": + raise ValueError( + "You set `use_liger_loss=True` but the loss type is not `sigmoid`. " + "Please set `loss_type='sigmoid'` to use the liger kernel." + ) + self.dpo_loss_fn = LigerFusedLinearDPOLoss( + ignore_index=args.label_pad_token_id, + beta=args.beta, + use_ref_model=not args.reference_free, + average_log_prob=False, + ) + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in DPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. 
+ model.warnings_issued["estimate_tokens"] = True + + # Data collator + if data_collator is None: + data_collator = DataCollatorForPreference(pad_token_id=self.padding_value) + + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.max_prompt_length = args.max_prompt_length + self.max_completion_length = args.max_completion_length + self.max_length = args.max_length + self.truncation_mode = args.truncation_mode + self.precompute_ref_log_probs = args.precompute_ref_log_probs + self.use_logits_to_keep = args.use_logits_to_keep + + if args.padding_free: + if model.config._attn_implementation != "flash_attention_2": + warnings.warn( + "Padding-free training is enabled, but the attention implementation is not set to " + "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and " + "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using " + "other implementations may lead to unexpected behavior. To ensure compatibility, set " + "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your " + "attention mechanism can handle flattened sequences." + ) + if args.per_device_train_batch_size == 1: + warnings.warn( + "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size " + "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size " + "to at least 2." + ) + self.padding_free = args.padding_free + + # Since ref_logs are precomputed on the first call to get_train/eval_dataloader + # keep track of first called to avoid computation of future calls + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + + if ( + args.loss_type in ["hinge", "ipo", "bco_pair", "sppo_hard", "nca_pair", "apo_zero", "apo_down"] + and args.label_smoothing > 0 + ): + warnings.warn( + f"You are using the {args.loss_type} loss type that does not support label smoothing. The " + "`label_smoothing` parameter will be ignored. Set `label_smoothing` to `0.0` to remove this warning.", + UserWarning, + ) + if args.loss_type == "kto_pair": + raise ValueError("Support for kto_pair has been removed in DPOTrainer. Please use KTOTrainer.") + + self.beta = args.beta + self.label_smoothing = args.label_smoothing + self.loss_type = args.loss_type + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.use_weighting = args.use_weighting + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + warnings.warn( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. 
Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + UserWarning, + ) + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + self.f_divergence_type = args.f_divergence_type + self.f_divergence_params = {FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY: args.f_alpha_divergence_coef} + self.dataset_num_proc = args.dataset_num_proc + + # Dataset preparation + train_dataset = self._prepare_dataset(train_dataset, processing_class, args, "train") + if eval_dataset is not None: + if isinstance(eval_dataset, dict): + eval_dataset = { + key: self._prepare_dataset(dataset, processing_class, args, key) + for key, dataset in eval_dataset.items() + } + else: + eval_dataset = self._prepare_dataset(eval_dataset, processing_class, args, "eval") + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" + ) + if args.sync_ref_model: + raise ValueError( + "You currently cannot use `ref_model=None` with TR-DPO method. Please provide `ref_model`." + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + elif self.is_fsdp_enabled: + self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + if self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with TR-DPO method. Please set `precompute_ref_log_probs=False`." 
+ ) + + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + if self.loss_type == "bco_pair": + self.running = RunningMoments(self.accelerator) + + def _create_model_from_path(self, model_path: str, args: DPOConfig, is_ref: bool = False) -> PreTrainedModel: + """Creates a model from a path or model identifier.""" + if not is_ref: + model_init_kwargs = args.model_init_kwargs or {} + else: + model_init_kwargs = args.ref_model_init_kwargs or {} + + # Handle torch dtype + torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + model_init_kwargs["torch_dtype"] = torch_dtype + else: + raise ValueError( + "Invalid `torch_dtype` passed to `DPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." + ) + # Disable caching if gradient checkpointing is enabled (not supported) + # if args.gradient_checkpointing: + # model_init_kwargs["use_cache"] = False + + # Create model + model = AutoModelForCausalLM.from_pretrained(model_path, **model_init_kwargs) + return model + + def _prepare_peft_model( + self, model: PreTrainedModel, ref_model: PreTrainedModel, peft_config: Any, args: DPOConfig + ) -> PreTrainedModel: + """Prepares a model for PEFT training.""" + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. + self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if ref_model is not None and not args.force_use_ref_model: + raise ValueError( + "You passed both a ref_model and a peft_config. For training PEFT adapters with DPO there is no need to pass a reference" + " model. Please pass `ref_model=None` in case you want to train PEFT adapters, or pass a ref_model with `force_use_ref_model=True` in DPOTrainer's init." + " if you want to use a different ref_model." 
+ ) + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + + else: + model = self._prepare_gradient_checkpointing(model, args) + + # get peft model with the given config + model = get_peft_model(model, peft_config) + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + else: + model = self._prepare_gradient_checkpointing(model, args) + + return model + + def _prepare_gradient_checkpointing(self, model: PreTrainedModel, args: DPOConfig): + """Prepare the gradienting checkpointing for the model.""" + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + if args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + return model + + def _prepare_dataset( + self, + dataset: Union[Dataset, IterableDataset], + processing_class: Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin], + args: DPOConfig, + dataset_name: str, + ) -> Union[Dataset, IterableDataset]: + # Build the kwargs for the `map` function + map_kwargs = {} + if isinstance(dataset, Dataset): # IterableDataset does not support num_proc nor writer_batch_size + map_kwargs["num_proc"] = args.dataset_num_proc + map_kwargs["writer_batch_size"] = 10 + + with PartialState().main_process_first(): + # Extract prompt if needed + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset" + dataset = dataset.map(maybe_extract_prompt, **map_kwargs) + + # Apply the chat template if needed + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset" + dataset = dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class, "tools": args.tools}, **map_kwargs + ) + + # Tokenize the dataset + if isinstance(dataset, Dataset): # `IterableDataset.map` does not support `desc` + map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset" + + dataset = dataset.map( + self.tokenize_row if not self.is_vision_model else self.process_row, + remove_columns=["chosen", "rejected"], + fn_kwargs={ + "processing_class": processing_class, + "max_prompt_length": args.max_prompt_length, + "max_completion_length": args.max_completion_length, + # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token]) + "add_special_tokens": False, + }, + 
**map_kwargs, + ) + + return dataset + + @staticmethod + def tokenize_row(features, processing_class, max_prompt_length, max_completion_length, add_special_tokens): + """ + Tokenize a row of the dataset. + + Args: + features (`dict[str, str]`): + Row of the dataset, should contain the keys `"prompt"`, `"chosen"`, and `"rejected"`. + processing_class (`PreTrainedTokenizerBase`): + Processing class used to process the data. + max_prompt_length (`int` or `None`): + Maximum length of the prompt sequence. If `None`, the prompt sequence is not truncated. + max_completion_length (`int` or `None`): + Maximum length of the completion sequences. If `None`, the completion sequences are not truncated. + add_special_tokens (`bool`): + Whether to add special tokens to the sequences. Typically used for encoder-decoder models. If `True`, + the prompt sequence will have a bos token prepended and an eos token appended. In any case, the + completion sequences will have an eos token appended. + + Returns: + `dict[str, list[int]]`: + Tokenized sequences with the keys `"prompt_input_ids"`, `"chosen_input_ids"`, and + `"rejected_input_ids". + + Example: + ```python + >>> from transformers import GPT2Tokenizer + + >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + >>> features = {"prompt": "The sky is", "chosen": " blue", "rejected": " green"} + >>> DPOTrainer.tokenize_row( + ... features, tokenizer, max_prompt_length=3, max_completion_length=3, add_special_tokens=False + ... ) + {'prompt_input_ids': [464, 6766, 318], 'chosen_input_ids': [4171, 50256], 'rejected_input_ids': [4077, 50256]} + ``` + """ + tokenizer = processing_class # the processing class is a tokenizer + prompt_input_ids = tokenizer(features["prompt"], add_special_tokens=False)["input_ids"] + chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)["input_ids"] + rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"] + + # Add special tokens (typically for encoder-decoder models) + if add_special_tokens: + if tokenizer.bos_token_id is not None: + prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids + if tokenizer.eos_token_id is not None: + prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id] + chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id] + rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id] + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_input_ids = prompt_input_ids[-max_prompt_length:] + if max_completion_length is not None: + chosen_input_ids = chosen_input_ids[:max_completion_length] + rejected_input_ids = rejected_input_ids[:max_completion_length] + + return { + "prompt_input_ids": prompt_input_ids, + "chosen_input_ids": chosen_input_ids, + "rejected_input_ids": rejected_input_ids, + } + + @staticmethod + def process_row(features, processing_class, max_prompt_length, max_completion_length, add_special_tokens): + """ + Same as `tokenize_row` but for vision models. Please refer to `tokenize_row` for more information. 
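+        In addition to the keys returned by `tokenize_row`, the output also includes `"pixel_values"` and, when the
+        processor provides them, `"pixel_attention_mask"` and `"image_sizes"`.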
+ """ + processor, tokenizer = processing_class, processing_class.tokenizer # the processing class is a processor + processed_features = processor(images=features["images"], text=features["prompt"], add_special_tokens=False) + + prompt_input_ids = processed_features["input_ids"][0] + pixel_values = processed_features["pixel_values"][0] + chosen_input_ids = tokenizer(features["chosen"], add_special_tokens=False)["input_ids"] + rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"] + + # Add special tokens (typically for encoder-decoder models) + if add_special_tokens: + if tokenizer.bos_token_id is not None: + prompt_input_ids = [tokenizer.bos_token_id] + prompt_input_ids + if tokenizer.eos_token_id is not None: + prompt_input_ids = prompt_input_ids + [tokenizer.eos_token_id] + chosen_input_ids = chosen_input_ids + [tokenizer.eos_token_id] + rejected_input_ids = rejected_input_ids + [tokenizer.eos_token_id] + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_input_ids = prompt_input_ids[-max_prompt_length:] + if max_completion_length is not None: + chosen_input_ids = chosen_input_ids[:max_completion_length] + rejected_input_ids = rejected_input_ids[:max_completion_length] + + output = { + "prompt_input_ids": prompt_input_ids, + "pixel_values": pixel_values, + "chosen_input_ids": chosen_input_ids, + "rejected_input_ids": rejected_input_ids, + } + + if "pixel_attention_mask" in processed_features: + output["pixel_attention_mask"] = processed_features["pixel_attention_mask"][0] + if "image_sizes" in processed_features: + output["image_sizes"] = processed_features["image_sizes"][0] + + return output + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In DPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by `DataCollatorForPreference`, hence the override. + if self._signature_columns is None: + self._signature_columns = [ + "prompt_input_ids", + "chosen_input_ids", + "rejected_input_ids", + "image_sizes", + "ref_chosen_logps", + "ref_rejected_logps", + ] + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`. 
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + batch_size = self.args.precompute_ref_batch_size or self.args.per_device_train_batch_size + dataloader_params = { + "batch_size": batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + + ref_chosen_logps = [] + ref_rejected_logps = [] + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(padded_batch) + ref_chosen_logp, ref_rejected_logp = self.accelerator.gather_for_metrics( + (ref_chosen_logp, ref_rejected_logp) + ) + ref_chosen_logps.append(ref_chosen_logp.cpu()) + ref_rejected_logps.append(ref_rejected_logp.cpu()) + + # Unnecessary cache clearing to avoid OOM + empty_cache() + self.accelerator.free_memory() + + all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy() + all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy() + + self.train_dataset = self.train_dataset.add_column(name="ref_chosen_logps", column=all_ref_chosen_logps) + self.train_dataset = self.train_dataset.add_column( + name="ref_rejected_logps", column=all_ref_rejected_logps + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. 
+ """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + batch_size = self.args.precompute_ref_batch_size or self.args.per_device_eval_batch_size + dataloader_params = { + "batch_size": batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + ref_chosen_logps = [] + ref_rejected_logps = [] + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + ref_chosen_logp, ref_rejected_logp = self.compute_ref_log_probs(padded_batch) + ref_chosen_logp, ref_rejected_logp = self.accelerator.gather_for_metrics( + (ref_chosen_logp, ref_rejected_logp) + ) + ref_chosen_logps.append(ref_chosen_logp.cpu()) + ref_rejected_logps.append(ref_rejected_logp.cpu()) + + all_ref_chosen_logps = torch.cat(ref_chosen_logps).float().numpy() + all_ref_rejected_logps = torch.cat(ref_rejected_logps).float().numpy() + + eval_dataset = eval_dataset.add_column(name="ref_chosen_logps", column=all_ref_chosen_logps) + eval_dataset = eval_dataset.add_column(name="ref_rejected_logps", column=all_ref_rejected_logps) + + # Save calculated ref_chosen_logps and ref_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def compute_ref_log_probs(self, batch: dict[str, torch.LongTensor]) -> dict: + """Computes log probabilities of the reference model for a single padded batch of a DPO specific dataset.""" + compte_ref_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), compte_ref_context_manager: + if self.ref_model is None: + with self.null_ref_context(): + ref_model_output = self.concatenated_forward(self.model, batch, is_ref_model=True) + else: + ref_model_output = self.concatenated_forward(self.ref_model, batch, is_ref_model=True) + return ref_model_output["chosen_logps"], ref_model_output["rejected_logps"] + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], padding_value: int + ) -> dict[str, torch.LongTensor]: + """ + Concatenate the `chosen` and `rejected` inputs from the batch into a single tensor for both the prompt and + completion sequences. + + Args: + batch (`dict[str, Union[list, torch.LongTensor]]`): + A batch of input data. The batch must contain the following keys: + + - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` representing the prompt input + IDs. 
+ - `"chosen_input_ids"`: Tensor of shape `(batch_size, chosen_length)` representing the chosen + completion input IDs. + - `"rejected_input_ids"`: Tensor of shape `(batch_size, rejected_length)` representing the rejected + completion input IDs. + - `"prompt_pixel_values"` (optional): Tensor for pixel values, if available. + - `"prompt_pixel_attention_mask"` (optional): Tensor for pixel attention masks, if available. + + padding_value (`int`): + The padding value to use for the concatenated completion sequences (`chosen_input_ids` and + `rejected_input_ids`). + + Returns: + `dict[str, torch.LongTensor]`: A dictionary containing: + + - `"prompt_input_ids"`: Concatenated prompt input IDs of shape `(2 * batch_size, prompt_length)`. + - `"completion_input_ids"`: Concatenated chosen and rejected completion input IDs of shape `(2 * + batch_size, max_completion_length)`. + - `"prompt_attention_mask"`: Concatenated prompt attention masks of shape `(2 * batch_size, + prompt_length)`. + - `"completion_attention_mask"`: Concatenated chosen and rejected attention masks of shape `(2 * + batch_size, max_completion_length)`. + - `"pixel_values"` (optional): Concatenated pixel values if `"prompt_pixel_values"` are present. + - `"pixel_attention_mask"` (optional): Concatenated pixel attention masks if + `"prompt_pixel_attention_mask"` are present. + + Notes: + The completion input IDs and attention masks are padded to the maximum completion length of the chosen or + rejected sequences. + """ + output = {} + + # For the prompt, the input_ids are the same for both the chosen and rejected responses + output["prompt_input_ids"] = torch.cat([batch["prompt_input_ids"], batch["prompt_input_ids"]], dim=0) + output["prompt_attention_mask"] = torch.cat( + [batch["prompt_attention_mask"], batch["prompt_attention_mask"]], dim=0 + ) + if "pixel_values" in batch: + output["pixel_values"] = torch.cat([batch["pixel_values"], batch["pixel_values"]], dim=0) + + if "pixel_attention_mask" in batch: + output["pixel_attention_mask"] = torch.cat( + [batch["pixel_attention_mask"], batch["pixel_attention_mask"]], dim=0 + ) + if "image_sizes" in batch: + output["image_sizes"] = torch.cat([batch["image_sizes"], batch["image_sizes"]], dim=0) + + # Concatenate the chosen and rejected completions + max_completion_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + output["completion_input_ids"] = torch.cat( + ( + pad_to_length(batch["chosen_input_ids"], max_completion_length, pad_value=padding_value), + pad_to_length(batch["rejected_input_ids"], max_completion_length, pad_value=padding_value), + ), + ) + output["completion_attention_mask"] = torch.cat( + ( + pad_to_length(batch["chosen_attention_mask"], max_completion_length, pad_value=0), + pad_to_length(batch["rejected_attention_mask"], max_completion_length, pad_value=0), + ), + ) + + return output + + def dpo_loss( + self, + chosen_logps: torch.FloatTensor, + rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. + + Args: + chosen_logps (`torch.FloatTensor`): + Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`. + rejected_logps (`torch.FloatTensor`): + Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`. 
+ ref_chosen_logps (`torch.FloatTensor`): + Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`. + ref_rejected_logps (`torch.FloatTensor`): + Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`. + + Returns: + A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`. The losses tensor contains the DPO + loss for each example in the batch. The `chosen_rewards` and `rejected_rewards` tensors contain the rewards + for the chosen and rejected responses, respectively. + """ + device = self.accelerator.device + + # Get the log ratios for the chosen and rejected responses + chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device) + rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device) + + if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value: + # The alpha-divergence formula: (1 - u^-alpha) / alpha + # The divergence difference between the chosen and rejected sample is: + # (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha + # = (u[l]^-alpha - u[w]^-alpha) / alpha + # where u[w] and u[l] are the policy/reference probability ratios + # for the chosen and rejected samples, respectively. + alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT + if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params: + alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY]) + logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef + else: + logratios = chosen_logps - rejected_logps + if self.reference_free: + ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device) + else: + ref_logratios = ref_chosen_logps - ref_rejected_logps + + logratios = logratios.to(self.accelerator.device) + ref_logratios = ref_logratios.to(self.accelerator.device) + logits = logratios - ref_logratios + + if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value: + # The js-divergence formula: log(2 * u / (1 + u)) + # The divergence difference between the chosen and rejected sample is: + # log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l])) + # = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l])) + # where u[w] and u[l] are the policy/reference probability ratios + # for the chosen and rejected samples, respectively. + logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios) + + # The beta is a temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the + # labels and calculates a conservative DPO loss. 
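+ # For reference, the standard "sigmoid" DPO loss computed below is, per example,
+ #   -(1 - label_smoothing) * logsigmoid(beta * logits) - label_smoothing * logsigmoid(-beta * logits)
+ # where logits = (chosen_logps - rejected_logps) - (ref_chosen_logps - ref_rejected_logps),
+ # up to the optional f-divergence adjustments applied above.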
+ if self.loss_type == "sigmoid": + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + + elif self.loss_type == "robust": + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + + F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) / (1 - 2 * self.label_smoothing) + + elif self.loss_type == "exo_pair": + # eqn (16) of the EXO paper: https://huggingface.co/papers/2402.00856 + import math + + if self.label_smoothing == 0: + self.label_smoothing = 1e-3 + losses = (self.beta * logits).sigmoid() * ( + F.logsigmoid(self.beta * logits) - math.log(1 - self.label_smoothing) + ) + (-self.beta * logits).sigmoid() * (F.logsigmoid(-self.beta * logits) - math.log(self.label_smoothing)) + + elif self.loss_type == "hinge": + losses = torch.relu(1 - self.beta * logits) + + elif self.loss_type == "ipo": + # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. + losses = (logits - 1 / (2 * self.beta)) ** 2 + + elif self.loss_type == "bco_pair": + chosen_logratios = chosen_logps - ref_chosen_logps + rejected_logratios = rejected_logps - ref_rejected_logps + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + rewards = torch.cat((chosen_rewards, rejected_rewards), 0).mean().detach() + self.running.update(rewards) + delta = self.running.mean + losses = -F.logsigmoid((self.beta * chosen_logratios) - delta) - F.logsigmoid( + -(self.beta * rejected_logratios - delta) + ) + + elif self.loss_type == "sppo_hard": + # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach, + # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class. + # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is + # set to 1 for the winner and 0 for the loser. 
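+ # With these hard labels, the squared terms below push beta * (chosen_logps - ref_chosen_logps) toward +1/2
+ # and beta * (rejected_logps - ref_rejected_logps) toward -1/2.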
+ a = chosen_logps - ref_chosen_logps + b = rejected_logps - ref_rejected_logps + losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2 + + elif self.loss_type == "nca_pair": + chosen_rewards = (chosen_logps - ref_chosen_logps) * self.beta + rejected_rewards = (rejected_logps - ref_rejected_logps) * self.beta + losses = ( + -F.logsigmoid(chosen_rewards) + - 0.5 * F.logsigmoid(-chosen_rewards) + - 0.5 * F.logsigmoid(-rejected_rewards) + ) + + elif self.loss_type == "aot_pair": + chosen_logratios = chosen_logps - ref_chosen_logps + rejected_logratios = rejected_logps - ref_rejected_logps + chosen_logratios_sorted, _ = torch.sort(chosen_logratios, dim=0) + rejected_logratios_sorted, _ = torch.sort(rejected_logratios, dim=0) + delta = chosen_logratios_sorted - rejected_logratios_sorted + losses = ( + -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * delta) * self.label_smoothing + ) + + elif self.loss_type == "aot": + logratios = chosen_logps - rejected_logps + ref_logratios = ref_chosen_logps - ref_rejected_logps + logratios_sorted, _ = torch.sort(logratios, dim=0) + ref_logratios_sorted, _ = torch.sort(ref_logratios, dim=0) + delta = logratios_sorted - ref_logratios_sorted + losses = ( + -F.logsigmoid(self.beta * delta) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * delta) * self.label_smoothing + ) + + elif self.loss_type == "apo_zero": + # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are better than your model's default output + losses_chosen = 1 - F.sigmoid(self.beta * chosen_logratios) # Increase chosen likelihood + losses_rejected = F.sigmoid(self.beta * rejected_logratios) # Decrease rejected likelihood + losses = losses_chosen + losses_rejected + + elif self.loss_type == "apo_down": + # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are worse than your model's default output. + # Decrease chosen likelihood and decrease rejected likelihood more + losses_chosen = F.sigmoid(self.beta * chosen_logratios) + losses_rejected = 1 - F.sigmoid(self.beta * (chosen_logratios - rejected_logratios)) + losses = losses_chosen + losses_rejected + + elif self.loss_type == "discopop": + # Eqn (5) of the DiscoPOP paper (https://huggingface.co/papers/2406.08414) + # This loss was discovered with LLM discovery + logratios = chosen_logps - rejected_logps + ref_logratios = ref_chosen_logps - ref_rejected_logps + logits = logratios - ref_logratios + logits = logits * self.beta + # Modulate the mixing coefficient based on the log ratio magnitudes + log_ratio_modulation = torch.sigmoid(logits / self.args.discopop_tau) + logistic_component = -F.logsigmoid(logits) + exp_component = torch.exp(-logits) + # Blend between logistic and exponential component based on log ratio modulation + losses = logistic_component * (1 - log_ratio_modulation) + exp_component * log_ratio_modulation + + else: + raise ValueError( + f"Unknown loss type: {self.loss_type}. 
Should be one of ['sigmoid', 'hinge', 'ipo', 'exo_pair', " + "'nca_pair', 'robust', 'bco_pair', 'sppo_hard', 'aot', 'aot_pair', 'discopop', 'apo_zero', 'apo_down']" + ) + + chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach() + rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach() + + return losses, chosen_rewards, rejected_rewards + + def _compute_loss_liger(self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]): + unwrapped_model = self.accelerator.unwrap_model(model) + concatenated_batch = self.concatenated_inputs(batch, padding_value=self.padding_value) + + model_kwargs = {} + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + # Add the pixel values and attention masks for vision models + if "pixel_values" in concatenated_batch: + model_kwargs["pixel_values"] = concatenated_batch["pixel_values"] + if "pixel_attention_mask" in concatenated_batch: + model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"] + if "image_sizes" in concatenated_batch: + model_kwargs["image_sizes"] = concatenated_batch["image_sizes"] + + prompt_attention_mask = concatenated_batch["prompt_attention_mask"] + completion_attention_mask = concatenated_batch["completion_attention_mask"] + + if self.is_encoder_decoder: + # 1. Get encoder outputs + encoder_outputs = unwrapped_model.get_encoder()( + concatenated_batch["prompt_input_ids"], + attention_mask=concatenated_batch["prompt_attention_mask"], + return_dict=True, + ) + # 2. Prepare decoder inputs + decoder_input_ids = shift_tokens_right( + concatenated_batch["completion_input_ids"], + unwrapped_model.config.decoder_start_token_id, + ) + # 3. Get decoder outputs + decoder_outputs = unwrapped_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + hidden_states = decoder_outputs.last_hidden_state + + ref_hidden_states = None + if not self.reference_free and self.ref_model is not None: + unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model) + ref_encoder_outputs = unwrapped_ref_model.get_encoder()( + concatenated_batch["prompt_input_ids"], + attention_mask=concatenated_batch["prompt_attention_mask"], + return_dict=True, + ) + ref_decoder_outputs = unwrapped_ref_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + ref_hidden_states = ref_decoder_outputs.last_hidden_state + elif not self.reference_free: + with self.null_ref_context(): + ref_encoder_outputs = unwrapped_model.get_encoder()( + concatenated_batch["prompt_input_ids"], + attention_mask=concatenated_batch["prompt_attention_mask"], + return_dict=True, + ) + ref_decoder_outputs = unwrapped_model.get_decoder()( + input_ids=decoder_input_ids, + attention_mask=concatenated_batch["completion_attention_mask"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + encoder_attention_mask=concatenated_batch["prompt_attention_mask"], + use_cache=False, + ) + ref_hidden_states = ref_decoder_outputs.last_hidden_state + + labels = concatenated_batch["completion_input_ids"] + loss_mask = 
completion_attention_mask.bool() + else: + # For decoder-only models + input_ids = torch.cat( + (concatenated_batch["prompt_input_ids"], concatenated_batch["completion_input_ids"]), dim=1 + ) + attention_mask = torch.cat( + (concatenated_batch["prompt_attention_mask"], concatenated_batch["completion_attention_mask"]), + dim=1, + ) + # Mask the prompt but not the completion for the loss + loss_mask = torch.cat( + (torch.zeros_like(prompt_attention_mask), completion_attention_mask), + dim=1, + ) + + # Flush and truncate + if self.max_length is not None and self.max_length < attention_mask.size(1): + if self.truncation_mode == "keep_start": + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + attention_mask = attention_mask[:, : self.max_length] + input_ids = input_ids[:, : self.max_length] + loss_mask = loss_mask[:, : self.max_length] + elif self.truncation_mode == "keep_end": + # Flush right before truncating left, then flush left + # [[0, 0, x, x, x, x], -> [[0, 0, x, x], + # [0, x, x, x, 0, 0]] [0, x, x, x]] + attention_mask, input_ids, loss_mask = flush_right(attention_mask, input_ids, loss_mask) + input_ids = input_ids[:, -self.max_length :] + attention_mask = attention_mask[:, -self.max_length :] + loss_mask = loss_mask[:, -self.max_length :] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + else: + raise ValueError( + f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', " + "'keep_start']." + ) + else: + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + + # Add logits_to_keep optimization + if self.use_logits_to_keep: + first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min() + logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1 + model_kwargs["logits_to_keep"] = logits_to_keep + + model_kwargs["output_hidden_states"] = True + + # Add padding-free training support + if self.padding_free: + input_ids = input_ids[attention_mask.bool()].unsqueeze(0) + loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0) + position_ids = attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1 + model_kwargs["position_ids"] = position_ids + else: + model_kwargs["attention_mask"] = attention_mask + + # Get the base model outputs (before LM head) + if hasattr(unwrapped_model, "get_decoder"): + base_model = unwrapped_model.get_decoder() + else: + base_model = getattr(unwrapped_model, self.args.base_model_attribute_name, unwrapped_model) + + outputs = base_model( + input_ids, + use_cache=False, + **model_kwargs, + ) + hidden_states = outputs.last_hidden_state[:, :-1] + + # Get reference hidden states if needed + ref_hidden_states = None + if not self.reference_free and self.ref_model is not None: + unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model) + if hasattr(unwrapped_ref_model, "get_decoder"): + ref_base_model = unwrapped_ref_model.get_decoder() + else: + ref_base_model = getattr( + unwrapped_ref_model, self.args.base_model_attribute_name, unwrapped_ref_model + ) + + ref_outputs = ref_base_model( + input_ids, + use_cache=False, + **model_kwargs, + ) + ref_hidden_states = ref_outputs.last_hidden_state[:, :-1] + elif not self.reference_free: + if 
hasattr(unwrapped_model, "get_decoder"): + ref_base_model = unwrapped_model.get_decoder() + else: + ref_base_model = getattr(unwrapped_model, self.args.base_model_attribute_name, unwrapped_model) + with self.null_ref_context(): + ref_outputs = ref_base_model( + input_ids, + use_cache=False, + **model_kwargs, + ) + ref_hidden_states = ref_outputs.last_hidden_state[:, :-1] + + masked_input_ids = torch.where(loss_mask != 0, input_ids, self.label_pad_token_id) + labels = masked_input_ids[:, 1:] # Shift right for casual LM + + # Get the LM head + lm_head = unwrapped_model.get_output_embeddings() + + # Get reference model weights if needed + ref_weight = None + ref_bias = None + if not self.reference_free: + if self.ref_model is not None: + unwrapped_ref_model = self.accelerator.unwrap_model(self.ref_model) + ref_lm_head = unwrapped_ref_model.get_output_embeddings() + else: + with self.null_ref_context(): + ref_lm_head = unwrapped_model.get_output_embeddings() + ref_weight = ref_lm_head.weight + ref_bias = ref_lm_head.bias if hasattr(ref_lm_head, "bias") else None + + # Compute loss using Liger kernel + loss_output = self.dpo_loss_fn( + lm_head.weight, + hidden_states, + labels, + bias=lm_head.bias if hasattr(lm_head, "bias") else None, + ref_input=ref_hidden_states if not self.reference_free else None, + ref_weight=ref_weight if not self.reference_free else None, + ref_bias=ref_bias if not self.reference_free else None, + ) + ( + loss, + (chosen_logps, rejected_logps, chosen_logits_mean, rejected_logits_mean, nll_loss, *aux_outputs), + ) = loss_output + + output = { + "loss": loss, + "chosen_logps": chosen_logps, + "rejected_logps": rejected_logps, + "mean_chosen_logits": chosen_logits_mean, + "mean_rejected_logits": rejected_logits_mean, + "nll_loss": nll_loss, + "chosen_rewards": aux_outputs[0], + "rejected_rewards": aux_outputs[1], + } + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]], is_ref_model: bool = False + ): + """ + Runs the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. + + Args: + model: + Model to run the forward pass on. + batch: + Batch of input data. + is_ref_model: + Whether this method is being called for the reference model. If `True`, length desensitization is not + applied. 
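+
+ Returns:
+ A dictionary with `"chosen_logps"`, `"rejected_logps"`, `"mean_chosen_logits"` and
+ `"mean_rejected_logits"`, plus `"nll_loss"`, `"policy_weights"` or `"aux_loss"` when the corresponding
+ options are enabled.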
+ """ + num_examples = batch["prompt_input_ids"].shape[0] + + concatenated_batch = self.concatenated_inputs(batch, padding_value=self.padding_value) + + model_kwargs = {"use_cache": False} + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + # Add the pixel values and attention masks for vision models + if "pixel_values" in concatenated_batch: + model_kwargs["pixel_values"] = concatenated_batch["pixel_values"] + if "pixel_attention_mask" in concatenated_batch: + model_kwargs["pixel_attention_mask"] = concatenated_batch["pixel_attention_mask"] + if "image_sizes" in concatenated_batch: + model_kwargs["image_sizes"] = concatenated_batch["image_sizes"] + + prompt_input_ids = concatenated_batch["prompt_input_ids"] + prompt_attention_mask = concatenated_batch["prompt_attention_mask"] + completion_input_ids = concatenated_batch["completion_input_ids"] + completion_attention_mask = concatenated_batch["completion_attention_mask"] + if self.is_encoder_decoder: + labels = completion_input_ids + labels[completion_attention_mask == 0] = self.label_pad_token_id + outputs = model( + input_ids=prompt_input_ids, + attention_mask=prompt_attention_mask, + labels=labels, # we need the labels for the logits to be returned + **model_kwargs, + ) + logits = outputs.logits + loss_mask = completion_attention_mask.bool() + else: + # Concatenate the prompt and completion inputs + input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1) + attention_mask = torch.cat((prompt_attention_mask, completion_attention_mask), dim=1) + # Mask the prompt but not the completion for the loss + loss_mask = torch.cat( + (torch.zeros_like(prompt_attention_mask), completion_attention_mask), + dim=1, + ) + + # Flush and truncate + if self.max_length is not None and self.max_length < attention_mask.size(1): + if self.truncation_mode == "keep_start": + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + attention_mask = attention_mask[:, : self.max_length] + input_ids = input_ids[:, : self.max_length] + loss_mask = loss_mask[:, : self.max_length] + elif self.truncation_mode == "keep_end": + # Flush right before truncating left, then flush left + # [[0, 0, x, x, x, x], -> [[0, 0, x, x], + # [0, x, x, x, 0, 0]] [0, x, x, x]] + attention_mask, input_ids, loss_mask = flush_right(attention_mask, input_ids, loss_mask) + input_ids = input_ids[:, -self.max_length :] + attention_mask = attention_mask[:, -self.max_length :] + loss_mask = loss_mask[:, -self.max_length :] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + else: + raise ValueError( + f"Unknown truncation mode: '{self.truncation_mode}'. Should be one of ['keep_end', " + "'keep_start']." 
+ ) + else: + # Flush left to reduce the memory usage + # [[0, 0, x, x, x, x], -> [[x, x, x, x], + # [0, x, x, x, 0, 0]] [x, x, x, 0]] + attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask) + + if self.use_logits_to_keep: + # Compute logits_to_keep based on loss_mask pattern: + # [[0, 0, 0, x, x, x, x], + # [0, 0, 0, x, x, x, 0]] + # ^ start computing logits from here ([:, -(7-3+1):]) + first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min() + logits_to_keep = (loss_mask.shape[1] - first_compute_index).item() + 1 # +1 for the first label + model_kwargs["logits_to_keep"] = logits_to_keep + + model_kwargs["output_hidden_states"] = True + + if self.padding_free: + # Flatten the input_ids, position_ids, and loss_mask + # input_ids = [[a, b, c, 0], -> input_ids = [[a, b, c, d, e, f, g]] + # [d, e, f, g]] position_ids = [[0, 1, 2, 0, 1, 2, 3]] + input_ids = input_ids[attention_mask.bool()].unsqueeze(0) + loss_mask = loss_mask[attention_mask.bool()].unsqueeze(0) + position_ids = attention_mask.cumsum(1)[attention_mask.bool()].unsqueeze(0) - 1 + model_kwargs["position_ids"] = position_ids + else: + model_kwargs["attention_mask"] = attention_mask + + outputs = model(input_ids, **model_kwargs) + logits = outputs.logits + + # Offset the logits by one to align with the labels + labels = torch.roll(input_ids, shifts=-1, dims=1) + loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool() + + if self.use_logits_to_keep: + # Align labels with logits + # logits: -, -, [x2, x3, x4, x5, x6] + # ^ --------- ^ after logits[:, :-1, :] + # labels: [y0, y1, y2, y3, y4, y5, y6] + # ^ --------- ^ with logits_to_keep=4, [:, -4:] + # loss_mask: [0, 0, 0, 1, 1, 1, 1] + labels = labels[:, -logits_to_keep:] + loss_mask = loss_mask[:, -logits_to_keep:] + + if logits.shape[:2] != labels.shape[:2]: + # for llava, the returned logits include the image tokens (placed before the text tokens) + seq_len = labels.shape[1] + logits = logits[:, -seq_len:] + + # Compute the log probabilities of the labels + labels[~loss_mask] = 0 # dummy token; we'll ignore the losses on these tokens later + per_token_logps = selective_log_softmax(logits, labels) + per_token_logps[~loss_mask] = 0 + per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1) + + if self.padding_free: + # Unflatten the per_token_logps (shape: [1, sum_seq_len] -> [batch_size, seq_len]) + batch_size, seq_len = attention_mask.shape + per_token_logps_ = torch.zeros( + batch_size, seq_len, device=outputs.logits.device, dtype=outputs.logits.dtype + ) + per_token_logps_[attention_mask.bool()] = per_token_logps + per_token_logps = per_token_logps_ + + all_logps = per_token_logps[:, 1:].sum(-1) + + output = {} + + if self.use_weighting: + with torch.no_grad(): + # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827 + logprobs = F.log_softmax(logits, dim=-1) + weights_adjustment_factor = torch.logsumexp(2 * logprobs, dim=-1) # same as sum(probs**2) in log space + per_token_logps_adjusted = per_token_logps - weights_adjustment_factor + all_weights = (per_token_logps_adjusted * loss_mask).sum(-1) / loss_mask.sum(-1) + chosen_weights = all_weights[:num_examples] + rejected_weights = all_weights[num_examples:] + output["policy_weights"] = torch.clamp(torch.exp(chosen_weights + rejected_weights), max=1) + + if self.args.rpo_alpha is not None: + # Only use the chosen logits for the RPO loss + chosen_logits = logits[:num_examples, :-1] if not self.is_encoder_decoder else logits[:num_examples] + chosen_labels = 
labels[:num_examples, :-1] if not self.is_encoder_decoder else labels[:num_examples] + + # Compute the log probabilities of the labels + output["nll_loss"] = F.cross_entropy( + torch.flatten(chosen_logits, end_dim=1), torch.flatten(chosen_labels, end_dim=1), ignore_index=0 + ) + + if self.loss_type == "ipo": + all_logps = all_logps / loss_mask.sum(-1) + + if self.args.ld_alpha is not None and not is_ref_model: + # Compute response lengths based on loss_mask + completion_lengths = loss_mask.sum(dim=1) + + chosen_lengths = completion_lengths[:num_examples] + rejected_lengths = completion_lengths[num_examples:] + public_lengths = torch.min(chosen_lengths, rejected_lengths) # l_p in the paper + public_lengths = torch.cat([public_lengths, public_lengths], dim=0) + + seq_len = per_token_logps.size(1) + position_ids = torch.arange(seq_len, device=per_token_logps.device).expand_as(per_token_logps) + + ld_mask = position_ids < public_lengths.unsqueeze(1) + mask = position_ids < completion_lengths.unsqueeze(1) + + front_mask = (ld_mask & mask).float() + rear_mask = (~ld_mask & mask).float() + front_logps = (per_token_logps * front_mask).sum(dim=1) + rear_logps = (per_token_logps * rear_mask).sum(dim=1) + + all_logps = front_logps + self.args.ld_alpha * rear_logps + + output["chosen_logps"] = all_logps[:num_examples] + output["rejected_logps"] = all_logps[num_examples:] + + # Compute the mean logits + if self.padding_free: + # position_ids contains a sequence of range identifiers (e.g., [[0, 1, 2, 0, 1, 2, 3, ...]]). + # There are 2*num_examples ranges in total: the first half corresponds to the chosen tokens, + # and the second half to the rejected tokens. + # To find the start of the rejected tokens, we look for the num_examples+1-th zero in pos_id. + split_idx = (position_ids == 0).nonzero(as_tuple=True)[1][num_examples] + mean_chosen_logits = logits[0, :split_idx][loss_mask[0, :split_idx]].mean() + mean_rejected_logits = logits[0, split_idx:][loss_mask[0, split_idx:]].mean() + else: + mean_chosen_logits = logits[:num_examples][loss_mask[:num_examples]].mean() + mean_rejected_logits = logits[num_examples:][loss_mask[num_examples:]].mean() + + output["mean_chosen_logits"] = mean_chosen_logits + output["mean_rejected_logits"] = mean_rejected_logits + + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + if self.args.use_liger_loss: + model_output = self._compute_loss_liger(model, batch) + losses = model_output["loss"] + chosen_rewards = model_output["chosen_rewards"] + rejected_rewards = model_output["rejected_rewards"] + else: + model_output = self.concatenated_forward(model, batch) + + # if ref_chosen_logps and ref_rejected_logps in batch use them, otherwise use the reference model + if "ref_chosen_logps" in batch and "ref_rejected_logps" in batch: + ref_chosen_logps = batch["ref_chosen_logps"] + ref_rejected_logps = batch["ref_rejected_logps"] + else: + ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch) + + losses, chosen_rewards, rejected_rewards = self.dpo_loss( + model_output["chosen_logps"], model_output["rejected_logps"], ref_chosen_logps, ref_rejected_logps + ) + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + if self.args.rpo_alpha is not None: + 
losses = losses + self.args.rpo_alpha * model_output["nll_loss"] # RPO loss from V3 of the paper + + if self.use_weighting: + losses = losses * model_output["policy_weights"] + + if self.aux_loss_enabled: + losses = losses + self.aux_loss_coef * model_output["aux_loss"] + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean().item() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean().item() + metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(reward_accuracies).mean().item() + metrics[f"{prefix}rewards/margins"] = ( + self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards).mean().item() + ) + metrics[f"{prefix}logps/chosen"] = ( + self.accelerator.gather_for_metrics(model_output["chosen_logps"]).detach().mean().item() + ) + metrics[f"{prefix}logps/rejected"] = ( + self.accelerator.gather_for_metrics(model_output["rejected_logps"]).detach().mean().item() + ) + metrics[f"{prefix}logits/chosen"] = ( + self.accelerator.gather_for_metrics(model_output["mean_chosen_logits"]).detach().mean().item() + ) + metrics[f"{prefix}logits/rejected"] = ( + self.accelerator.gather_for_metrics(model_output["mean_rejected_logits"]).detach().mean().item() + ) + if self.args.rpo_alpha is not None: + metrics[f"{prefix}nll_loss"] = ( + self.accelerator.gather_for_metrics(model_output["nll_loss"]).detach().mean().item() + ) + if self.aux_loss_enabled: + metrics[f"{prefix}aux_loss"] = ( + self.accelerator.gather_for_metrics(model_output["aux_loss"]).detach().mean().item() + ) + + return losses.mean(), metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + # force log the metrics + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return loss, metrics + + return loss + + def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. 
+ generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.padding_value, + ) + + # if ref_output in batch use that otherwise use the reference model + if "ref_output" in batch: + ref_output = batch["ref_output"] + else: + if self.ref_model is None: + with self.null_ref_context(): + ref_output = self.model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.padding_value, + ) + else: + ref_output = self.ref_model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.padding_value, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.padding_value) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + ref_output = pad_to_length(ref_output, self.max_length, self.padding_value) + ref_output_decoded = self.processing_class.batch_decode(ref_output, skip_special_tokens=True) + + return policy_output_decoded, ref_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") + + # force log the metrics + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return loss.detach(), None, None + + # logits for the chosen and rejected samples from model + logits_dict = { + "eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. 
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip( + random_batch_dataset["prompt"], policy_output_decoded, ref_output_decoded + ) + ], + ) + if "wandb" in self.args.report_to and self.accelerator.is_main_process: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. + """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent( + """\ + @inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, + }""" + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="DPO", + trainer_citation=citation, + paper_title="Direct Preference Optimization: Your Language Model is Secretly a Reward Model", + paper_id="2305.18290", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothDPOTrainer(_UnslothDPOTrainer): + """ + + Trainer for Direct Preference Optimization (DPO) method. + + This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods. + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + args ([`DPOConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`DataCollatorForPreference`]. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. DPO supports [preference](#preference) type and. The format of the samples can + be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. 
+ - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+ and content).
+ eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+ Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
+ Processing class used to process the data. If `None`, the processing class is loaded from the model's name
+ with [`~transformers.AutoTokenizer.from_pretrained`].
+ compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+ The function that will be used to compute metrics at evaluation. Must take an [`EvalPrediction`] and return
+ a dictionary mapping strings to metric values. *Note*: When passing TrainingArgs with `batch_eval_metrics` set
+ to `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered
+ after the last eval batch to signal that the function needs to calculate and return the global summary
+ statistics rather than accumulating the batch-level statistics.
+ callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
+ List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
+ [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+ If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+ method.
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+ A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+ model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+ optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`):
+ A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in
+ `args`. Incompatible with the `optimizers` argument.
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`):
+ A function that preprocesses the logits right before caching them at each evaluation step. Must take two
+ tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
+ by this function will be reflected in the predictions received by `compute_metrics`.
+
+ Note that the labels (second parameter) will be `None` if the dataset does not have them.
+ peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
+ PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
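+
+ Example (minimal sketch; assumes `model`, `tokenizer` and a preference-formatted `train_dataset` have
+ already been prepared, e.g. with `unsloth.FastLanguageModel` and `datasets.load_dataset`):
+ ```python
+ >>> trainer = UnslothDPOTrainer(
+ ... model=model,
+ ... args=UnslothDPOConfig(output_dir="dpo-output", per_device_train_batch_size=2),
+ ... train_dataset=train_dataset,
+ ... processing_class=tokenizer,
+ ... )
+ >>> trainer.train()
+ ```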
+ + """ + def __init__( + self, + model, + ref_model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_metrics = None, + callbacks = None, + optimizer_cls_and_kwargs = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothDPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = 
False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('dpo_trainer', other_metrics) + if hasattr(train_dataset, 'column_names'): + column_names = set(train_dataset.column_names) + check = ['chosen', 'rejected', 'prompt', 'chosen_input_ids', 'chosen_attention_mask', + 'chosen_labels', 'rejected_input_ids', 'rejected_attention_mask', 'rejected_labels', + 'prompt_input_ids', 'prompt_attention_mask'] + if all(x in column_names for x in check): + train_dataset = train_dataset.remove_columns(['chosen', 'rejected', 'prompt']) + del check, column_names + + super().__init__( + model = model, + ref_model = ref_model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + compute_metrics = compute_metrics, + callbacks = callbacks, + optimizer_cls_and_kwargs = optimizer_cls_and_kwargs, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 
'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothGKDTrainer.py b/unsloth_compiled_cache/UnslothGKDTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4efb224e9448ca58ed8bd135e10331edc5d5d4e --- /dev/null +++ b/unsloth_compiled_cache/UnslothGKDTrainer.py @@ -0,0 +1,860 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.gkd_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DataCollator, DataCollatorForChatML, Dataset, EvalPrediction, F, FeatureExtractionMixin, GKDConfig, GKDTrainer, GenerationConfig, Optional, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SFTTrainer, TrainerCallback, Union, disable_dropout_in_model, empty_cache, generate_model_card, get_comet_experiment_url, is_wandb_available, nn, os, prepare_deepspeed, random, textwrap, torch, unwrap_model_for_generation, wandb) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothGKDConfig(GKDConfig): + """ + + Configuration class for [`GKDTrainer`]. + + This class includes only the parameters that are specific to GKD training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] and [`SFTConfig`] documentation. + + Args: + temperature (`float`, *optional*, defaults to `0.9`): + Temperature for sampling. The higher the temperature, the more random the completions. + lmbda (`float`, *optional*, defaults to `0.5`): + Lambda parameter that controls the student data fraction (i.e., the proportion of on-policy + student-generated outputs). + beta (`float`, *optional*, defaults to `0.5`): + Interpolation coefficient between `0.0` and `1.0` of the Generalized Jensen-Shannon Divergence loss. When + beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence. + max_new_tokens (`int`, *optional*, defaults to `128`): + Maximum number of tokens to generate per completion. + teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`): + Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being + trained. 
+ teacher_model_init_kwargs (`dict[str, Any]]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model + from a string. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + seq_kd (`bool`, *optional*, defaults to `False`): + Seq_kd parameter that controls whether to perform Sequence-Level KD (can be viewed as supervised FT on + teacher-generated output). + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + 
include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + model_init_kwargs = None, + chat_template_path = None, + dataset_text_field = 'text', + dataset_kwargs = None, + dataset_num_proc = None, + eos_token = None, + pad_token = None, + max_length = 1024, + packing = False, + packing_strategy = 'ffd', + padding_free = False, + pad_to_multiple_of = None, + eval_packing = None, + completion_only_loss = None, + assistant_only_loss = False, + activation_offloading = False, + max_seq_length = None, + temperature = 0.9, + lmbda = 0.5, + beta = 0.5, + max_new_tokens = 128, + teacher_model_name_or_path = None, + teacher_model_init_kwargs = None, + disable_dropout = True, + seq_kd = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + 
optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + chat_template_path = chat_template_path, + dataset_text_field = dataset_text_field, + dataset_kwargs = dataset_kwargs, + dataset_num_proc = dataset_num_proc, + eos_token = eos_token, + pad_token = pad_token, + max_length = max_length, + packing = packing, + packing_strategy = packing_strategy, + padding_free = padding_free, + pad_to_multiple_of = pad_to_multiple_of, + eval_packing = eval_packing, + completion_only_loss = completion_only_loss, + assistant_only_loss = assistant_only_loss, + activation_offloading = activation_offloading, + max_seq_length = max_seq_length, + temperature = temperature, + lmbda = lmbda, + beta = beta, + max_new_tokens = max_new_tokens, + teacher_model_name_or_path = teacher_model_name_or_path, + teacher_model_init_kwargs = teacher_model_init_kwargs, + disable_dropout = disable_dropout, + seq_kd = seq_kd,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothGKDTrainer(SFTTrainer): + _tag_names = ["trl", "gkd"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + teacher_model: Union[PreTrainedModel, nn.Module, str] = None, + args: Optional[GKDConfig] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, 
FeatureExtractionMixin, ProcessorMixin] + ] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + formatting_func: Optional[Callable] = None, + ): + # add remove_unused_columns=False to the dataclass args + args.remove_unused_columns = False + data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length) + + super().__init__( + model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + peft_config=peft_config, + formatting_func=formatting_func, + ) + + if args.teacher_model_init_kwargs is None: + teacher_model_init_kwargs = {} + elif not isinstance(teacher_model, str): + raise ValueError( + "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated." + ) + else: + teacher_model_init_kwargs = args.teacher_model_init_kwargs + teacher_model_init_kwargs["torch_dtype"] = ( + teacher_model_init_kwargs["torch_dtype"] + if teacher_model_init_kwargs["torch_dtype"] in ["auto", None] + else getattr(torch, teacher_model_init_kwargs["torch_dtype"]) + ) + + if isinstance(teacher_model, str): + teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs) + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(self.model) + + if self.is_deepspeed_enabled: + self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator) + else: + self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True) + + self.lmbda = args.lmbda + self.beta = args.beta + self.temperature = args.temperature + self.seq_kd = args.seq_kd + + self.generation_config = GenerationConfig( + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + do_sample=True, + top_k=0, + use_cache=False if args.gradient_checkpointing else True, + pad_token_id=self.processing_class.pad_token_id, + ) + # Set custom EOS tokens if they are specified by the model's generation + # config. This is important for models with the Llama 3 chat template, + # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of + # turns or messages. + if ( + hasattr(self.model.generation_config, "eos_token_id") + and self.model.generation_config.eos_token_id is not None + ): + self.generation_config.eos_token_id = self.model.generation_config.eos_token_id + + @staticmethod + def generalized_jsd_loss( + student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean" + ): + """ + Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1) + of https://huggingface.co/papers/2306.13649 for the definition. 
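+
+ In the notation of the code below, with per-token student distribution `P_s`, teacher distribution `P_t` (both
+ taken after temperature scaling) and mixture `M = beta * P_t + (1 - beta) * P_s`, the interpolated loss is
+ `JSD_beta(P_t || P_s) = beta * KL(P_t || M) + (1 - beta) * KL(P_s || M)`; the edge cases are returned directly
+ as `KL(P_t || P_s)` for `beta == 0` and `KL(P_s || P_t)` for `beta == 1`.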
+ + Args: + student_logits: + Tensor of shape (batch_size, sequence_length, vocab_size) + teacher_logits: + Tensor of shape (batch_size, sequence_length, vocab_size) + labels: + Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing + loss + beta: + Interpolation coefficient between 0 and 1 (default: 0.5) + temperature: + Softmax temperature (default: 1.0) + reduction: + Specifies the reduction to apply to the output (default: 'batchmean') + + Returns: + loss: Scalar tensor with the generalized JSD loss + """ + + # Apply temperature scaling + student_logits = student_logits / temperature + teacher_logits = teacher_logits / temperature + + # Compute log probabilities for student and probabilities for teacher + student_log_probs = F.log_softmax(student_logits, dim=-1) + teacher_log_probs = F.log_softmax(teacher_logits, dim=-1) + + if beta == 0: + jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True) + elif beta == 1: + jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True) + else: + # Compute the log of the mixture distribution + # log(a + b) = log(exp(log(a)) + exp(log(b))) -> for mixture + beta = torch.tensor(beta, dtype=student_log_probs.dtype) + mixture_log_probs = torch.logsumexp( + torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]), + dim=0, + ) + + # Compute KL divergences using F.kl_div + # PyTorch differs from the standard mathematical definition, so the order of the probability distributions is swapped compared to that defined in the paper. + kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True) + kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True) + + # Compute the Generalized Jensen-Shannon Divergence + jsd = beta * kl_teacher + (1 - beta) * kl_student + + # Masking + if labels is not None: + mask = labels != -100 + jsd = jsd[mask] + + # Apply reduction + if reduction == "batchmean": + return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / (jsd.size(0) * jsd.size(1)) + elif reduction == "sum": + return jsd.sum() + elif reduction == "mean": + return jsd.mean() + else: + return jsd + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + # compute student output + outputs_student = model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + ) + + # compute teacher output in eval mode + self.teacher_model.eval() + with torch.no_grad(): + outputs_teacher = self.teacher_model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + ) + + # slice the logits for the generated tokens using the inputs["prompts"] lengths + prompt_lengths = inputs["prompts"].shape[1] + shifted_student_logits = outputs_student.logits[:, prompt_lengths - 1 : -1, :] + shifted_teacher_logits = outputs_teacher.logits[:, prompt_lengths - 1 : -1, :] + shifted_labels = inputs["labels"][:, prompt_lengths:] + + # compute loss + loss = self.generalized_jsd_loss( + student_logits=shifted_student_logits, + teacher_logits=shifted_teacher_logits, + labels=shifted_labels, + beta=self.beta, + ) + + # empty cache + empty_cache() + + # Return loss + return (loss, outputs_student) if return_outputs else loss + + @staticmethod + def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None): + # Generate output with respect to the prompt only + generated_outputs = model.generate( 
+ input_ids=inputs["prompts"], + attention_mask=inputs.get("prompt_attention_mask", None), + generation_config=generation_config, + return_dict_in_generate=True, + ) + + # Get the generated token IDs + generated_tokens = generated_outputs.sequences + # Calculate new attention mask + new_attention_mask = torch.ones_like(generated_tokens) + new_labels = generated_tokens.clone() + + # If there's pad_token_id, set attention mask to 0 for padding tokens + if pad_token_id is not None: + new_labels[new_labels == pad_token_id] = -100 + new_attention_mask[generated_tokens == pad_token_id] = 0 + + return generated_tokens, new_attention_mask, new_labels + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + """ + Perform a training step for the Generalized Knowledge Distillation (GKD) model. + + This method implements the on-policy learning approach described in the GKD paper. With probability + `self.lmbda`, it generates new responses using the student model, which are then used for training instead of + the original inputs. + """ + if self.seq_kd: + with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model: + new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs( + unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id + ) + inputs["input_ids"] = new_input_ids + inputs["attention_mask"] = new_attention_mask + inputs["labels"] = new_labels + if random.random() <= self.lmbda: + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model: + new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs( + unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id + ) + inputs["input_ids"] = new_input_ids + inputs["attention_mask"] = new_attention_mask + inputs["labels"] = new_labels + + loss = super().training_step(model, inputs, num_items_in_batch) + return loss + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
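+
+ Example:
+ An illustrative call (all argument values below are placeholders):
+
+ ```python
+ trainer.create_model_card(
+     model_name = "my-gkd-student",
+     dataset_name = "my-distillation-dataset",
+     tags = ["gkd", "distillation"],
+ )
+ ```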
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @inproceedings{agarwal2024on-policy, + title = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}}, + author = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem}, + year = 2024, + booktitle = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=3zKtaqxLhW}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GKD", + trainer_citation=citation, + paper_title="On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes", + paper_id="2306.13649", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothGKDTrainer(_UnslothGKDTrainer): + """ + + """ + def __init__( + self, + model = None, + teacher_model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + formatting_func = None, + **kwargs + ): + if args is None: args = UnslothGKDConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('gkd_trainer', other_metrics) + + super().__init__( + model = model, + teacher_model = teacher_model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + formatting_func = formatting_func,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothGRPOTrainer.py b/unsloth_compiled_cache/UnslothGRPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..108e9b6c6c2c1f4b3dac19e724e55a45dd3cb757 --- /dev/null +++ b/unsloth_compiled_cache/UnslothGRPOTrainer.py @@ -0,0 +1,2503 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.grpo_trainer import (Any, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, DataLoader, Dataset, FSDP, GRPOConfig, GRPOTrainer, GenerationConfig, IterableDataset, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, RepeatSampler, RewardFunc, Sampler, SyncRefModelCallback, Trainer, TrainerCallback, Union, VLLMClient, _ForwardRedirection, apply_chat_template, broadcast_object_list, datasets, defaultdict, deque, disable_dropout_in_model, gather, gather_object, generate_model_card, get_comet_experiment_url, identity, is_conversational, is_datasets_available, is_liger_kernel_available, is_peft_available, is_peft_model, is_rich_available, is_vllm_available, is_wandb_available, maybe_apply_chat_template, nanmax, nanmin, nanstd, nn, nullcontext, os, pad, partial, prepare_deepspeed, prepare_fsdp, print_prompt_completions_sample, profiling_context, profiling_decorator, re, seed_worker, set_seed, shuffle_tensor_dict, split_tensor_dict, textwrap, torch, transformers, unwrap_model_for_generation, version, wandb, warnings, Any, FSDP, Union, apply_chat_template, broadcast_object_list, gather, gather_object, is_conversational, 
maybe_apply_chat_template, nanstd, nullcontext, os, pad, profiling_context, re, torch, unwrap_model_for_generation, os, re, torch, transformers, re, Any, Union, os, profiling_decorator, re, shuffle_tensor_dict, split_tensor_dict, torch, Optional, PreTrainedModel, Trainer, is_peft_available, os, re, torch, FSDP, nn, os, re, GRPOTrainer, Trainer, gather, os, re, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps + +def grpo_compute_loss( + ref_logits, + new_logits, + old_logits, + input_ids, + mask, + beta, + advantages, + **kwargs +): + # All Unsloth Zoo code licensed under LGPLv3 + # Set defaults for optional arguments + loss_type = kwargs.get("loss_type", "grpo") + epsilon_low = kwargs.get("epsilon_low", 0.2) + epsilon_high = kwargs.get("epsilon_high", 0.2) + max_completion_length = kwargs.get("max_completion_length", 8192) + delta = kwargs.get("delta", None) + temperature = kwargs.get("temperature", 1.0) + logit_scale_multiply = kwargs.get("logit_scale_multiply", 0.0) + logit_scale_divide = kwargs.get("logit_scale_divide", 0.0) + logit_softcapping = kwargs.get("logit_softcapping", 0.0) + + input_ids = input_ids.unsqueeze(-1) + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: new_logits = new_logits * logit_scale_multiply + if logit_scale_divide != 0: new_logits = new_logits / logit_scale_divide + if logit_softcapping != 0: new_logits = new_logits * torch.tanh(new_logits / logit_softcapping) + + new_logits = new_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: new_logits = new_logits / temperature + new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1) + new = new_x - torch.logsumexp(new_logits, dim = -1) + + # x_i - logsumexp(x_i) + with torch.no_grad(): + if beta != 0.0: + assert ref_logits is not None, "ref_logits should not be None when beta != 0.0" + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: ref_logits = ref_logits * logit_scale_multiply + if logit_scale_divide != 0: ref_logits = ref_logits / logit_scale_divide + if logit_softcapping != 0: ref_logits = ref_logits * torch.tanh(ref_logits / logit_softcapping) + + ref_logits = ref_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: ref_logits = ref_logits / temperature + ref_x = 
torch.gather(ref_logits, dim = -1, index = input_ids).squeeze(-1) + ref = ref_x - torch.logsumexp(ref_logits, dim = -1) + pass + + if old_logits is not None: + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: old_logits = old_logits * logit_scale_multiply + if logit_scale_divide != 0: old_logits = old_logits / logit_scale_divide + if logit_softcapping != 0: old_logits = old_logits * torch.tanh(old_logits / logit_softcapping) + + old_logits = old_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: old_logits = old_logits / temperature + old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1) + old = old_x - torch.logsumexp(old_logits, dim = -1) + pass + pass + + # Reverse KL + # Note that this is a low variance low bias estimator for the KL divergence as used in GRPO paper + if beta != 0.0: + kl_i = torch.exp(ref - new) - (ref - new) - 1.0 + + else: + kl_i = 0.0 # set it to 0 to not effect the downstream computation + # Full correct reverse KL divergence?? Missing term maybe? + # kl_i = torch.exp(new) * kl_i + + # Below is forward KL (normal KL) + # kl_i = torch.exp(old) * (old - new) + if old_logits is not None: + coef_1 = torch.exp(new - old) + else: + coef_1 = torch.exp(new - new.detach()) + coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high) + + if delta is not None: + loss_1 = torch.clamp(coef_1, max=delta) * advantages.unsqueeze(1) + else: + loss_1 = coef_1 * advantages.unsqueeze(1) + pass + + # Must detach - otherwise gradients are not propagated correctly! + # exp(x - x) == 1 + # loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1) + + loss_2 = coef_2 * advantages.unsqueeze(1) + loss_i = -torch.min(loss_1, loss_2) + if beta != 0.0: + loss_i = loss_i + beta * kl_i + + mask = mask.to(torch.float32) + n_mask_per_reward = mask.sum(1) + + # https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L1363-L1370 + if loss_type == "grpo": + loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + elif loss_type == "bnpo": + loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length) + else: + raise ValueError(f"Unknown loss type: {loss_type}") + + # loss = (loss_i * mask).sum() / mask.sum() + + # Get metrics as well which are folded + with torch.inference_mode(): + completion_length = n_mask_per_reward.mean() + mean_kl_per_reward = (kl_i * mask).sum(1) / n_mask_per_reward + mean_kl = mean_kl_per_reward.mean() + pass + + return loss, completion_length, mean_kl + +class UnslothEfficientGRPO(torch.autograd.Function): + # All Unsloth Zoo code licensed under LGPLv3 + @staticmethod + def forward(ctx, _new_hidden_states, _old_hidden_states, _ref_hidden_states, lm_head, _input_ids, _mask, _advantages, beta, scaler = None, n_chunks = 1, extra_kwargs=None): + if extra_kwargs is None: + extra_kwargs = {} + def compute_loss(new_hidden_states, old_hidden_states, ref_hidden_states, input_ids, mask, advantages, scaling): + new_logits = torch.matmul(new_hidden_states, lm_head.t()) + new_logits = new_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + with torch.no_grad(): + if beta != 0.0: + ref_logits = torch.matmul(ref_hidden_states, lm_head.t()) + ref_logits = ref_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + else: + 
ref_logits = None + if old_hidden_states is not None: + old_logits = torch.matmul(old_hidden_states, lm_head.t()) + old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + else: + old_logits = None + # if old_hidden_states is not None: + # old_logits = torch.matmul(old_hidden_states, lm_head.t()) #last logit already excluded + # old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + # else: + # old_logits = None + # unsloth_zoo/rl_replacements.py + loss, completion_length, mean_kl = grpo_compute_loss( + ref_logits, + new_logits, + old_logits, + input_ids, + mask, + beta, + advantages, + **extra_kwargs, + ) + + # Scale loss if needed for mixed precision training + scaled_loss = loss * scaling + # Must add .loss.detach otherwise autograd uses 2x VRAM + return scaled_loss, (loss.detach(), completion_length, mean_kl,) + pass + + device =_new_hidden_states.device + grad_inputs = torch.empty_like(_new_hidden_states) + accumulated_loss = torch.zeros(1, device = device) + accumulated_completion_length = torch.zeros(1, device = device) + accumulated_mean_kl = torch.zeros(1, device = device) + + def accumulate_chunk(new_hidden_states_j, old_hidden_states_j, ref_hidden_states_j, input_ids_j, mask_j, advantages_j, scaling): + (chunk_grad_input,), (chunk_loss, (unscaled_loss, chunk_completion_length, chunk_mean_kl,)) = torch.func.grad_and_value( + compute_loss, + argnums = (0,), + has_aux = True, + )(new_hidden_states_j, old_hidden_states_j, ref_hidden_states_j, input_ids_j, mask_j, advantages_j, scaling) + accumulated_loss .add_(unscaled_loss) + accumulated_completion_length.add_(chunk_completion_length) + accumulated_mean_kl .add_(chunk_mean_kl) + return chunk_grad_input + pass + + accumulate_chunk = torch.compile( + accumulate_chunk, + fullgraph = True, + options = torch_compile_options, + ) + + grad_inputs_chunks = torch.chunk(grad_inputs, chunks = n_chunks, dim = 0) + new_hidden_states = torch.chunk(_new_hidden_states, chunks = n_chunks, dim = 0) + if _old_hidden_states is not None: + old_hidden_states = torch.chunk(_old_hidden_states, chunks = n_chunks, dim = 0) + else: + old_hidden_states = [None] * n_chunks + ref_hidden_states = torch.chunk(_ref_hidden_states, chunks = n_chunks, dim = 0) + input_ids = torch.chunk(_input_ids, chunks = n_chunks, dim = 0) + mask = torch.chunk(_mask, chunks = n_chunks, dim = 0) + advantages = torch.chunk(_advantages, chunks = n_chunks, dim = 0) + + # Get mixed precision scaling if seen + scaling = scaler.get_scale() if scaler is not None else 1.0 + + # Force torch.compile to use dynamic shapes for seqlen dim + mark_dynamic = lambda x: torch._dynamo.mark_dynamic(x, 1) + + for (grad_inputs_j, new_hidden_states_j, old_hidden_states_j, ref_hidden_states_j, input_ids_j, mask_j, advantages_j,) in \ + zip(grad_inputs_chunks, new_hidden_states, old_hidden_states, ref_hidden_states, input_ids, mask, advantages): + + mark_dynamic(new_hidden_states_j) + mark_dynamic(ref_hidden_states_j) + if old_hidden_states_j is not None: + mark_dynamic(old_hidden_states_j) + mark_dynamic(input_ids_j) + mark_dynamic(mask_j) + + + grad_inputs_j.copy_(accumulate_chunk(new_hidden_states_j, old_hidden_states_j,ref_hidden_states_j, input_ids_j, mask_j, advantages_j, scaling)) + pass + + grad_inputs .div_(n_chunks) + accumulated_loss .div_(n_chunks) + accumulated_completion_length.div_(n_chunks) + accumulated_mean_kl .div_(n_chunks) + ctx.save_for_backward(grad_inputs) + return ( + accumulated_loss, + 
accumulated_completion_length, + accumulated_mean_kl, + ) + pass + + @staticmethod + def backward(ctx, grad_output, dcompletion_length, dmean_kl): + (grad_input,) = ctx.saved_tensors + return (grad_input, None, None, None, None, None, None, None, None, None, None) + pass + +def grpo_accumulated_loss( + trainer, + input_ids, + attention_mask, + logits_to_keep, + completion_mask, + advantages, + old_hidden_states, + n_chunks = -1, + **kwargs, +): + # All Unsloth Zoo code licensed under LGPLv3 + bsz, qlen = input_ids.shape + + # Find closest multiple + factors = [i for i in range(1, bsz + 1) if bsz % i == 0] + if n_chunks == -1: n_chunks = bsz + n_chunks = factors[min(np.searchsorted(factors, n_chunks), len(factors)-1)] + + if not hasattr(trainer, '_autocast_dtype'): + trainer._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': trainer._autocast_dtype = torch.float16 + pass + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" + + completion_input_ids = input_ids[:, -logits_to_keep:] + lm_head = trainer.model.get_output_embeddings().weight + + with torch.amp.autocast(device_type = trainer.model.device.type, dtype = trainer._autocast_dtype): + with torch.inference_mode(), trainer.accelerator.unwrap_model(trainer.model, keep_fp32_wrapper = False).disable_adapter(): + ref_hidden_states = trainer.model( + input_ids = input_ids, + attention_mask = attention_mask, + logits_to_keep = logits_to_keep + 1, + ).logits + pass + new_hidden_states = trainer.model( + input_ids = input_ids, + attention_mask = attention_mask, + logits_to_keep = logits_to_keep + 1, + ).logits + + loss, completion_length, mean_kl = UnslothEfficientGRPO.apply( + new_hidden_states, + old_hidden_states, + ref_hidden_states, + lm_head, + completion_input_ids, + completion_mask, + advantages, + trainer.beta, + trainer.accelerator.scaler, + n_chunks, + kwargs # pass kwargs as a dict + ) + pass + # Must force not returning hidden states but logits otherwise gibberish + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0" + return loss, completion_length, mean_kl + + # Old non efficient code path + new_logits = torch.matmul(new_hidden_states, lm_head.t()) + new_logits = new_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + old_logits = torch.matmul(old_hidden_states, lm_head.t()) + old_logits = old_logits[:, :-1, :] # exclude the last logit: it corresponds to the next token pred + loss, completion_length, mean_kl = grpo_compute_loss( + old_logits, + new_logits, + completion_input_ids, + completion_mask, + trainer.beta, + advantages, + ) + return loss, completion_length, mean_kl + pass + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options) +def grpo_compute_loss_slow( + ref_logits, + new_logits, + old_logits, + input_ids, + mask, + beta, + advantages, + **kwargs +): + # All Unsloth Zoo code licensed under LGPLv3 + # Set defaults for optional arguments + loss_type = kwargs.get("loss_type", "grpo") + epsilon_low = kwargs.get("epsilon_low", 0.2) + epsilon_high = kwargs.get("epsilon_high", 0.2) + max_completion_length = kwargs.get("max_completion_length", 8192) + delta = kwargs.get("delta", None) + temperature = kwargs.get("temperature", 1.0) + logit_scale_multiply = kwargs.get("logit_scale_multiply", 0.0) + logit_scale_divide = kwargs.get("logit_scale_divide", 0.0) + logit_softcapping = kwargs.get("logit_softcapping", 0.0) + + input_ids = input_ids.unsqueeze(-1) 
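+
+ # Summary of the computation below (identical to grpo_compute_loss above), per completion token:
+ #   logp_new = log pi_theta(token) from new_logits; logp_old from old_logits (or detached logp_new if old_logits is None)
+ #   coef     = exp(logp_new - logp_old), the importance-sampling ratio (optionally capped at `delta`)
+ #   loss_t   = -min(coef * A, clamp(coef, 1 - epsilon_low, 1 + epsilon_high) * A) + beta * kl_t
+ # where A is the broadcast advantage and kl_t = exp(logp_ref - logp_new) - (logp_ref - logp_new) - 1 is the
+ # low-variance reverse-KL estimator (added only when beta != 0); the masked per-token losses are then reduced
+ # according to loss_type ("grpo", "bnpo" or "dr_grpo").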
+ + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: new_logits = new_logits * logit_scale_multiply + if logit_scale_divide != 0: new_logits = new_logits / logit_scale_divide + if logit_softcapping != 0: new_logits = new_logits * torch.tanh(new_logits / logit_softcapping) + + new_logits = new_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: new_logits = new_logits / temperature + new_x = torch.gather(new_logits, dim = -1, index = input_ids).squeeze(-1) + new = new_x - torch.logsumexp(new_logits, dim = -1) + + # x_i - logsumexp(x_i) + with torch.no_grad(): + if beta != 0.0: + assert ref_logits is not None, "ref_logits should not be None when beta != 0.0" + + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: ref_logits = ref_logits * logit_scale_multiply + if logit_scale_divide != 0: ref_logits = ref_logits / logit_scale_divide + if logit_softcapping != 0: ref_logits = ref_logits * torch.tanh(ref_logits / logit_softcapping) + + ref_logits = ref_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: ref_logits = ref_logits / temperature + ref_x = torch.gather(ref_logits, dim = -1, index = input_ids).squeeze(-1) + ref = ref_x - torch.logsumexp(ref_logits, dim = -1) + pass + + if old_logits is not None: + # Optional logit softcapping and logit dividing + if logit_scale_multiply != 0: old_logits = old_logits * logit_scale_multiply + if logit_scale_divide != 0: old_logits = old_logits / logit_scale_divide + if logit_softcapping != 0: old_logits = old_logits * torch.tanh(old_logits / logit_softcapping) + + old_logits = old_logits.to(torch.float32) + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + if temperature != 1.0: old_logits = old_logits / temperature + old_x = torch.gather(old_logits, dim = -1, index = input_ids).squeeze(-1) + old = old_x - torch.logsumexp(old_logits, dim = -1) + pass + pass + + # Reverse KL + # Note that this is a low variance low bias estimator for the KL divergence as used in GRPO paper + if beta != 0.0: + kl_i = torch.exp(ref - new) - (ref - new) - 1.0 + + else: + kl_i = 0.0 # set it to 0 to not effect the downstream computation + # Full correct reverse KL divergence?? Missing term maybe? + # kl_i = torch.exp(new) * kl_i + + # Below is forward KL (normal KL) + # kl_i = torch.exp(old) * (old - new) + if old_logits is not None: + coef_1 = torch.exp(new - old) + else: + coef_1 = torch.exp(new - new.detach()) + coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high) + + if delta is not None: + loss_1 = torch.clamp(coef_1, max=delta) * advantages.unsqueeze(1) + else: + loss_1 = coef_1 * advantages.unsqueeze(1) + pass + + # Must detach - otherwise gradients are not propagated correctly! 
+ # exp(x - x) == 1 + # loss_i = torch.exp(new - new.detach()) * advantages.unsqueeze(1) + + loss_2 = coef_2 * advantages.unsqueeze(1) + loss_i = -torch.min(loss_1, loss_2) + if beta != 0.0: + loss_i = loss_i + beta * kl_i + + mask = mask.to(torch.float32) + n_mask_per_reward = mask.sum(1) + + # https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L1363-L1370 + if loss_type == "grpo": + loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + elif loss_type == "bnpo": + loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length) + else: + raise ValueError(f"Unknown loss type: {loss_type}") + + # loss = (loss_i * mask).sum() / mask.sum() + + # Get metrics as well which are folded + with torch.inference_mode(): + completion_length = n_mask_per_reward.mean() + mean_kl_per_reward = (kl_i * mask).sum(1) / n_mask_per_reward + mean_kl = mean_kl_per_reward.mean() + pass + + return loss, completion_length, mean_kl + +def vLLMSamplingParams(**kwargs): + from vllm import SamplingParams + sampling_params = SamplingParams(**kwargs) + sampling_params._set_kwargs = kwargs + return sampling_params +@dataclass +class UnslothGRPOConfig(GRPOConfig): + """ + + Configuration class for the [`GRPOTrainer`]. + + This class includes only the parameters that are specific to GRPO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`str`, `dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`GRPOTrainer`] is provided as a string. + disable_dropout (`bool`, *optional*, defaults to `False`): + Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents + the model from generating different logprobs for the same input. + + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `8`): + Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size + * gradient_accumulation_steps) must be evenly divisible by this value. + max_completion_length (`int` or `None`, *optional*, defaults to `256`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. 
However, disabling this option allows training models that exceed the VRAM
+            capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
+            with vLLM generation.
+        shuffle_dataset (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training dataset.
+
+        > Parameters that control generation
+
+        generation_batch_size: (`int` or `None`, *optional*, defaults to `None`):
+            Batch size to use for generation. If `None`, it defaults to the effective training batch size:
+            `per_device_train_batch_size * num_processes * gradient_accumulation_steps`.
+        steps_per_generation: (`int` or `None`, *optional*, defaults to `None`):
+            Number of optimization steps per generation. If `None`, it defaults to `gradient_accumulation_steps`.
+        temperature (`float`, defaults to `1.0`):
+            Temperature for sampling. The higher the temperature, the more random the completions.
+        top_p (`float`, *optional*, defaults to `1.0`):
+            Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to
+            `1.0` to consider all tokens.
+        top_k (`int` or `None`, *optional*, defaults to `None`):
+            Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is
+            disabled and all tokens are considered.
+        min_p (`float` or `None`, *optional*, defaults to `None`):
+            Minimum token probability, which will be scaled by the probability of the most likely token. It must be a
+            value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range.
+        repetition_penalty (`float`, *optional*, defaults to `1.0`):
+            Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far.
+            Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat
+            tokens.
+        cache_implementation (`str` or `None`, *optional*, defaults to `None`):
+            Implementation of the cache method for faster generation when use_vllm is set to False.
+        generation_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
+            Additional keyword arguments to pass to `GenerationConfig` (if using transformers) or `SamplingParams` (if
+            using vLLM) when sampling completions. This can be used to further customize the generation behavior, such
+            as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict with the other generation
+            parameters (like `min_p`, `top_p`, etc.), they will override them.
+
+        > Parameters that control generation acceleration powered by vLLM
+
+        use_vllm (`bool`, *optional*, defaults to `False`):
+            Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation
+            instead of the default model.generate(). Requires `vllm` to be installed.
+        vllm_mode (`str`, *optional*, defaults to `"server"`):
+            Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
+            `"colocate"`.
+
+            - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM
+              server is running (start with `trl vllm-serve`).
+            - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a
+              separate server but may cause resource contention with training.
+        vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
+            Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
+ + > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`) + vllm_server_base_url (`str` or `None`, *optional*, defaults to `None`): + Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and + `vllm_server_port` are ignored. + vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`): + Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. + vllm_server_port (`int`, *optional*, defaults to `8000`): + Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. + vllm_server_timeout (`float`, *optional*, defaults to `240.0`): + Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the + timeout, a `ConnectionError` is raised. + + > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`) + + vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`): + Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_gpu_memory_utilization` flag. + vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`): + Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to + `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when + launching the vLLM server via the `--vllm_tensor_parallel_size` flag. + + > Parameters that control the training + + beta (`float`, *optional*, defaults to `0.0`): + KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and improving + training speed. + num_iterations (`int`, *optional*, defaults to `1`): + Number of iterations per batch (denoted as μ in the algorithm). + epsilon (`float`, *optional*, defaults to `0.2`): + Epsilon value for clipping. + delta: (`float` or `None`, *optional*, defaults to `None`): + Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` (default), standard + GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This method is introduced in + the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291). + epsilon_high (`float` or `None`, *optional*, defaults to `None`): + Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound + specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. + reward_weights (`list[float]` or `None`, *optional*, defaults to `None`): + Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are + weighted equally with weight `1.0`. + scale_rewards (`bool`, *optional*, defaults to `True`): + Whether to scale the rewards by dividing them by their standard deviation. If `True` (default), the rewards + are normalized by the standard deviation, ensuring they have unit variance. If `False`, no scaling is + applied. The [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) recommends not scaling the rewards, + as scaling by the standard deviation introduces a question-level difficulty bias. + loss_type (`str`, *optional*, defaults to `"bnpo"`): + Specifies the loss formulation to use. Supported values are: + + - `"grpo"`: Aggregates token-level losses by normalizing over sequence length. 
Not recommended due to + length bias—this approach tends to prefer shorter completions with positive advantages and longer ones + with negative advantages. + - `"bnpo"`: Aggregates token-level losses by normalizing number of active token in the local batch. + Note that normalization is performed over the local batch only, so results may slightly vary depending + on the local batch size, despite a constant effective batch size. When using + `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss. + - `"dr_grpo"`: Aggregates token-level losses by normalizing with a global constant. This method was + introduced in the [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) to eliminate length bias. + The value of the constant corresponds to `max_completion_length`. + mask_truncated_completions (`bool`, *optional*, defaults to `False`): + When enabled, truncated completions are excluded from the loss calculation, preventing them from being + incorrectly penalized and introducing noise during training. According to the + [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability. + sync_ref_model (`bool`, *optional*, defaults to `False`): + Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using + the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you + must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `512`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use the Liger GRPO loss. + + > Parameters that control the logging + + log_completions (`bool`, *optional*, defaults to `False`): + Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed, + it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. + num_completions_to_print (`int` or `None`, *optional*, defaults to `None`): + Number of completions to print with `rich`. If `None`, all completions are logged. + wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`): + Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts + are logged. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = False, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + model_init_kwargs 
= None, + disable_dropout = False, + max_prompt_length = 512, + num_generations = 8, + max_completion_length = 256, + ds3_gather_for_generation = True, + shuffle_dataset = True, + generation_batch_size = None, + steps_per_generation = None, + temperature = 1.0, + top_p = 1.0, + top_k = None, + min_p = None, + generation_kwargs = {}, + repetition_penalty = 1.0, + cache_implementation = None, + use_vllm = False, + vllm_server_base_url = None, + vllm_mode = 'colocate', + vllm_guided_decoding_regex = None, + vllm_server_host = '0.0.0.0', + vllm_server_port = 8000, + vllm_server_timeout = 240.0, + vllm_gpu_memory_utilization = 0.3, + vllm_tensor_parallel_size = 1, + beta = 0.001, + num_iterations = 1, + epsilon = 0.2, + delta = None, + epsilon_high = None, + reward_weights = None, + scale_rewards = True, + loss_type = 'bnpo', + mask_truncated_completions = False, + sync_ref_model = False, + ref_model_mixup_alpha = 0.6, + ref_model_sync_steps = 512, + use_liger_loss = False, + log_completions = False, + num_completions_to_print = None, + wandb_log_unique_prompts = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if loss_type.lower() == 'dr_grpo': + loss_type = 'dr_grpo' + elif loss_type.lower() == 'dapo': + loss_type = 'dapo' + if loss_type.lower() == 'dr_grpo': + if scale_rewards == None: + scale_rewards = True + elif scale_rewards == True: + print('Unsloth: The Dr GRPO paper recommends setting `scale_rewards` to False! Will override. 
Set it to `None` to force False.') + scale_rewards = False + elif loss_type.lower() == 'dapo': + print('Unsloth: The DAPO paper recommends `mask_truncated_completions = True`') + print('Unsloth: The DAPO paper recommends `epsilon_high = 0.28`') + print('Unsloth: The DAPO paper recommends setting `beta = 0.0` to remove the KL term') + mask_truncated_completions = True + epsilon_high = 0.28 + beta = 0.0 + loss_type = 'bnpo' + + if (per_device_train_batch_size // num_generations) * num_generations != per_device_train_batch_size: + print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations)) + per_device_train_batch_size = num_generations + + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + 
label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + disable_dropout = disable_dropout, + max_prompt_length = max_prompt_length, + num_generations = num_generations, + max_completion_length = max_completion_length, + ds3_gather_for_generation = ds3_gather_for_generation, + shuffle_dataset = shuffle_dataset, + generation_batch_size = generation_batch_size, + steps_per_generation = steps_per_generation, + temperature = temperature, + top_p = top_p, + top_k = top_k, + min_p = min_p, + generation_kwargs = generation_kwargs, + repetition_penalty = repetition_penalty, + cache_implementation = cache_implementation, + use_vllm = use_vllm, + vllm_server_base_url = vllm_server_base_url, + vllm_mode = vllm_mode, + vllm_guided_decoding_regex = vllm_guided_decoding_regex, + vllm_server_host = vllm_server_host, + vllm_server_port = vllm_server_port, + vllm_server_timeout = vllm_server_timeout, + vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, + vllm_tensor_parallel_size = vllm_tensor_parallel_size, + beta = beta, + num_iterations 
= num_iterations, + epsilon = epsilon, + delta = delta, + epsilon_high = epsilon_high, + reward_weights = reward_weights, + scale_rewards = scale_rewards, + loss_type = loss_type, + mask_truncated_completions = mask_truncated_completions, + sync_ref_model = sync_ref_model, + ref_model_mixup_alpha = ref_model_mixup_alpha, + ref_model_sync_steps = ref_model_sync_steps, + use_liger_loss = use_liger_loss, + log_completions = log_completions, + num_completions_to_print = num_completions_to_print, + wandb_log_unique_prompts = wandb_log_unique_prompts,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothGRPOTrainer(Trainer): + """""" + + _tag_names = ["trl", "grpo"] + + def __init__( + self, + model: Union[str, PreTrainedModel], + reward_funcs: Union[RewardFunc, list[RewardFunc]], + args: Optional[GRPOConfig] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, + processing_class: Optional[PreTrainedTokenizerBase] = None, + reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + peft_config: Optional["PeftConfig"] = None, + ): + + if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'): + if (getattr(args, 'use_vllm', False) == False): + args.use_vllm = True + args.vllm_mode='colocate' + # Args + if args is None: + model_name = model if isinstance(model, str) else model.config._name_or_path + model_name = model_name.split("/")[-1] + args = GRPOConfig(f"{model_name}-GRPO") + + # Models + # Trained model + model_init_kwargs = args.model_init_kwargs or {} + if isinstance(model, str): + model_id = model + torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + model_init_kwargs["torch_dtype"] = torch_dtype + else: + raise ValueError( + "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." + ) + # Disable caching if gradient checkpointing is enabled [not supported] + model_init_kwargs["use_cache"] = ( + False if args.gradient_checkpointing else model_init_kwargs.get("use_cache") + ) + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + else: + model_id = model.config._name_or_path + if args.model_init_kwargs is not None: + raise ValueError( + "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. " + "This argument can only be used when the `model` argument is a string." + ) + + if False: + if not is_peft_available(): + raise ImportError("PEFT is required to use `peft_config`. 
Run `pip install peft`.") + model = model + + # Enable gradient checkpointing if requested + if args.gradient_checkpointing: + model = self._enable_gradient_checkpointing(model, args) + + # Processing class + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left") + if processing_class.pad_token is None: + processing_class.pad_token = processing_class.eos_token + + # Reward functions + if not isinstance(reward_funcs, list): + reward_funcs = [reward_funcs] + self.reward_func_names = [] + for i, reward_func in enumerate(reward_funcs): + if isinstance(reward_func, str): + reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( + reward_func, num_labels=1, **model_init_kwargs + ) + if isinstance(reward_funcs[i], nn.Module): # Use Module over PretrainedModel for compat w/ compiled models + self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1]) + else: + self.reward_func_names.append(reward_funcs[i].__name__) + self.reward_funcs = reward_funcs + + # Reward weights + if args.reward_weights is not None: + if len(args.reward_weights) != len(reward_funcs): + raise ValueError( + f"Number of reward weights ({len(args.reward_weights)}) must match number of reward " + f"functions ({len(reward_funcs)})" + ) + self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32) + else: + self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) + + # Reward processing class + if reward_processing_classes is None: + reward_processing_classes = [None] * len(reward_funcs) + elif not isinstance(reward_processing_classes, list): + reward_processing_classes = [reward_processing_classes] + else: + if len(reward_processing_classes) != len(reward_funcs): + raise ValueError("The number of reward processing classes must match the number of reward functions.") + + for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): + if isinstance(reward_func, PreTrainedModel): + if reward_processing_class is None: + reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) + if reward_processing_class.pad_token_id is None: + reward_processing_class.pad_token = reward_processing_class.eos_token + # The reward model computes the reward for the latest non-padded token in the input sequence. + # So it's important to set the pad token ID to the padding token ID of the processing class. 
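+                # Illustration with made-up token rows: in a right-padded reward batch such as
+                #   ["The cake is good <pad>", "Bad cake <pad> <pad> <pad>"]
+                # the score is taken at each row's last non-padded position, so the reward model's
+                # pad_token_id must agree with the tokenizer's pad token for that position to be found.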
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id + reward_processing_classes[i] = reward_processing_class + self.reward_processing_classes = reward_processing_classes + + # Training arguments + self.max_prompt_length = args.max_prompt_length + self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper + self.num_generations = args.num_generations # = G in the GRPO paper + self.temperature = args.temperature + self.top_p = args.top_p + self.top_k = args.top_k + self.min_p = args.min_p + self.repetition_penalty = args.repetition_penalty + self.use_vllm = args.use_vllm + self.vllm_mode = args.vllm_mode + self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization # only applies to colocation mode + self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size # only applies to colocation mode + self.use_liger_loss = args.use_liger_loss + self.loss_type = args.loss_type + self.scale_rewards = args.scale_rewards + self.mask_truncated_completions = args.mask_truncated_completions + + # Datasets + self.shuffle_dataset = args.shuffle_dataset + + if ( + isinstance(train_dataset, IterableDataset) + or isinstance(eval_dataset, IterableDataset) + or ( + isinstance(eval_dataset, dict) and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values()) + ) + ): + # See https://github.com/huggingface/trl/issues/3213 + raise NotImplementedError( + "Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead." + ) + + # Multi-step + self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper + self.epsilon_low = args.epsilon + self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon + # Tracks the number of iterations [forward + backward passes], including those within a grad accum cycle + self._step = 0 + # Buffer the batch to reuse generated outputs across multiple updates. For more details, see + # `_get_train_sampler` and `_prepare_inputs`. + self._buffered_inputs = None + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the + # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning: + # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To + # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True. + # This acts as a flag to indicate that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + super().__init__( + model=model, + args=args, + data_collator=identity, # No data collation is needed in GRPO + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + callbacks=callbacks, + optimizers=optimizers, + ) + + # Reference model + self.beta = args.beta + if self.beta == 0.0: + # If beta is 0.0, the reference model is not needed + self.ref_model = None + elif is_peft_model(model): + # If PEFT is used, the reference model is not needed since the adapter can be disabled + # to revert to the initial model. 
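+            # Sketch of how that is used later in _generate_and_score_completions:
+            #     with self.accelerator.unwrap_model(self.model).disable_adapter():
+            #         ref_per_token_logps = self._get_per_token_logps(self.model, ...)
+            # i.e. the frozen base weights double as the reference policy, so no second copy is kept.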
+ self.ref_model = None + else: + # For deepspeed, fsdp or non-distributed models, create a reference model from scratch + self.ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs) + + # Disable dropout in the models + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Liger loss + if self.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "Liger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`." + ) + # redirect the model.module forward to the model forward to ensure pre-forward hooks are called + self._forward_redirection = _ForwardRedirection() + + self.liger_grpo_loss = LigerFusedLinearGRPOLoss( + beta=self.beta, + epsilon_low=self.epsilon_low, + epsilon_high=self.epsilon_high, + temperature=self.temperature, + use_ref_model=self.beta != 0.0, + loss_type=self.loss_type, + max_completion_length=self.max_completion_length, + ) + + # Initialize the metrics + self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} + self._total_train_tokens = 0 + self.log_completions = args.log_completions + self.wandb_log_unique_prompts = args.wandb_log_unique_prompts + self.num_completions_to_print = args.num_completions_to_print + # maxlen is set to the total number of forward passes per step. This value of `maxlen` ensures we log only the + # final optimization step. + maxlen = self.accelerator.num_processes * args.per_device_train_batch_size * args.steps_per_generation + self._textual_logs = { + "prompt": deque(maxlen=maxlen), + "completion": deque(maxlen=maxlen), + "rewards": defaultdict(lambda: deque(maxlen=maxlen)), + "advantages": deque(maxlen=maxlen), + } + + # Ensure each process receives a unique seed to prevent duplicate completions when generating with + # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but + # it's safer to set it in all cases. + set_seed(args.seed, device_specific=True) + + if self.use_vllm: + if not is_vllm_available(): + raise ImportError( + "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " + "`pip install vllm` to use it." + ) + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + if args.vllm_server_base_url is not None: + base_url = args.vllm_server_base_url + else: + base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}" + self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout) + self.vllm_client.init_communicator() + + elif self.vllm_mode == "colocate": + if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0: + raise ValueError( + f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size " + f"({self.accelerator.num_processes}) evenly." 
+ ) + + if self.vllm_tensor_parallel_size > 1: + self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration( + [ + list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size)) + for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size) + ] + ) + + self.llm = model.vllm_engine + self.guided_decoding_regex = args.vllm_guided_decoding_regex + + self._last_loaded_step = -1 + self.accelerator.wait_for_everyone() + else: + generation_kwargs = { + "max_new_tokens": self.max_completion_length, + "do_sample": True, + "pad_token_id": processing_class.pad_token_id, + "bos_token_id": processing_class.bos_token_id, + "eos_token_id": processing_class.eos_token_id, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": self.top_k, + "min_p": self.min_p, + "repetition_penalty": self.repetition_penalty, + "cache_implementation": args.cache_implementation, + } + if args.generation_kwargs is not None: + generation_kwargs.update(args.generation_kwargs) + self.generation_config = GenerationConfig(**generation_kwargs) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags to the model + self.model.add_model_tags(self._tag_names) + + if self.ref_model is not None: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + elif self.is_fsdp_enabled: + self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + if self.is_deepspeed_enabled: + self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator) + else: + # set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp + self.reward_funcs[i] = self.accelerator.prepare_model( + reward_func, evaluation_mode=True, device_placement=True + ) + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. + # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by the `training_step` method, hence the override. + if self._signature_columns is None: + self._signature_columns = ["prompt"] + + # This method overrides `Trainer.get_train_dataloader` to support our custom batching strategy. + # Instead of returning a standard per-step batch (i.e., `per_device_batch_size), our dataloader loads an + # *generation* batch (i.e., `per_device_batch_size × steps_per_generation`). This allows us to generate completions + # once every steps_per_generation step—rather than once per accumulation step—which is significantly more + # efficient. The only change from the original implementation is multiplying the batch size by + # `steps_per_generation`. Thus, `_prepare_inputs` is called with this *generation* batch, and it handles the + # splitting internally. 
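+    # Worked example with assumed numbers: per_device_train_batch_size = 4 and steps_per_generation = 4
+    # give a dataloader batch of 16 prompts per device; completions are generated once for all 16 and
+    # then consumed in 4 slices of 4 over the following optimization steps.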
+ # Maintenance note: This method is a copy-paste of the original `Trainer.get_train_dataloader` with only one line + # modification. As a result, some parts of the method aren't relevant to GRPO, but we keep them to stay one line + # apart from the super method, ensuring easier maintenance in the future. + def get_train_dataloader(self): + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="training") + + dataloader_params = { + "batch_size": self._train_batch_size * self.args.steps_per_generation, # < this is the change + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + if version.parse(transformers.__version__) >= version.parse("4.52.0"): + # from transformers 4.52.0, the `seed_worker` requires the `num_workers` and `rank` arguments + dataloader_params["worker_init_fn"] = partial( + seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index + ) + else: + dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Sampler: + # Returns a sampler that + # 1. ensures each prompt is repeated across multiple processes. This guarantees that identical prompts are + # distributed to different GPUs, allowing rewards to be computed and normalized correctly within each prompt + # group. Using the same seed across processes ensures consistent prompt assignment, preventing discrepancies + # in group formation. + # 2. repeats the batch multiple times to allow reusing generations across multiple updates. Refer to + # _prepare_inputs to see how the generations are stored and reused. + + # In the following figure, the values are the prompt indices. The first row shows the first sampled batch, the + # second row shows the second sampled batch, and so on. + # + # | GPU 0 | GPU 1 | + # + # global_step step <-───> num_generations=2 + # <-───────> per_device_train_batch_size=3 + # grad_accum ▲ ▲ 0 0 0 0 1 1 2 2 <- Generate for the first `steps_per_generation` (prompts 0 to 11); store the completions; use the first slice to compute the loss + # =2 ▼ | 0 1 3 3 4 4 5 5 <- Take the stored generations and use the second slice to compute the loss + # | + # | 1 2 6 6 7 7 8 8 <- Take the stored generations and use the third slice to compute the loss + # steps_per_gen=4 ▼ 1 3 9 9 10 10 11 11 <- Take the stored generations and use the fourth slice to compute the loss + # + # 2 4 12 12 13 13 14 14 <- Generate for the second `steps_per_generation` (prompts 12 to 23); store the completions; use the first slice to compute the loss + # 2 5 15 15 16 16 17 17 <- Take the stored generations and use the second slice to compute the loss + # ... 
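+        # Reading the sampler arguments below against the figure above: mini_repeat_count = num_generations
+        # places the copies of each prompt next to each other, batch_size = generation_batch_size //
+        # num_generations is the number of distinct prompts drawn per generation round, and
+        # repeat_count = num_iterations * steps_per_generation replays that round so the stored
+        # completions can be reused by every optimization step that consumes them.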
+ if dataset is None: + dataset = self.train_dataset + return RepeatSampler( + data_source=dataset, + mini_repeat_count=self.num_generations, + batch_size=self.args.generation_batch_size // self.num_generations, + repeat_count=self.num_iterations * self.args.steps_per_generation, + shuffle=self.shuffle_dataset, + seed=self.args.seed, + ) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + # See _get_train_sampler for an explanation of the sampler. + return RepeatSampler( + data_source=eval_dataset, + mini_repeat_count=self.num_generations, + seed=self.args.seed, + ) + + def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: GRPOConfig) -> PreTrainedModel: + """Enables gradient checkpointing for the model.""" + # Ensure use_cache is disabled + model.config.use_cache = False + + # Enable gradient checkpointing on the base model for PEFT + if is_peft_model(model): + model.base_model.gradient_checkpointing_enable() + # Enable gradient checkpointing for non-PEFT models + else: + model.gradient_checkpointing_enable() + + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {} + use_reentrant = ( + "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"] + ) + + if use_reentrant: + model.enable_input_require_grads() + + return model + + @profiling_decorator + def _get_last_hidden_state(self, unwrapped_model, input_ids, attention_mask, logits_to_keep=None): + if is_peft_model(unwrapped_model): + unwrapped_model = unwrapped_model.base_model.model + last_hidden_state = unwrapped_model.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state + last_hidden_state = last_hidden_state[:, :-1, :] # (B, L-1, H) + if logits_to_keep is not None: + last_hidden_state = last_hidden_state[:, -logits_to_keep:, :] # (B, logits_to_keep, H) + return last_hidden_state + + # Get the per-token log probabilities for the completions for the model and the reference model + def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep): + if True: # os.environ.get('UNSLOTH_USE_NEW_MODEL', '0') == '0': + return None # Unsloth efficient GRPO + # Otherwise, calculate normally: + if not hasattr(self, '_autocast_dtype'): + self._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': self._autocast_dtype = torch.float16 + + os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" + with torch.amp.autocast(device_type = DEVICE_TYPE, dtype = self._autocast_dtype): + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = model( + input_ids = input_ids, + attention_mask = attention_mask, + logits_to_keep = logits_to_keep + 1, + ).logits + # logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + return logits + # input_ids = input_ids[:, -logits_to_keep:] + # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves. 
+ # See https://github.com/huggingface/trl/issues/2770 + # logits = logits[:, -logits_to_keep:] + # return logits + # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details + # logits = logits / self.temperature + # logps = selective_log_softmax(logits, input_ids) + + # row_indices, col_indices = torch.where(logps < -20) + + # # Method 1: Check if tensors have elements + # if len(row_indices) > 0 and len(col_indices) > 0: + # breakpoint() # Breakpoint triggered here + # print("Found high values!") + # return logps # compute logprobs for the input tokens + pass + + def _sync_fsdp_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None): + """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.""" + if visited is None: + visited = set() + + for child_name, child_module in module.named_children(): + child_prefix = f"{prefix}.{child_name}" if prefix else child_name + self._sync_fsdp_params_to_vllm( + child_module, prefix=child_prefix, visited=visited + ) # recurse into the child + + if isinstance(module, FSDP): + with FSDP.summon_full_params(module, recurse=False, writeback=False): + for param_name, param in module.named_parameters(): + full_name = f"{prefix}.{param_name}" if prefix else param_name + for extra in ("_fsdp_wrapped_module.", "_checkpoint_wrapped_module."): + full_name = full_name.replace(extra, "") + + if full_name in visited: + continue # skip FSDP subtrees already traversed + visited.add(full_name) + + if self.vllm_mode == "server" and self.accelerator.is_main_process: + self.vllm_client.update_named_param(full_name, param.data) + elif self.vllm_mode == "colocate": + + pass + + pass + + def _move_model_to_vllm(self, *args, **kwargs): return None + + @profiling_decorator + def _prepare_inputs( + self, generation_batch: dict[str, Union[torch.Tensor, Any]] + ) -> dict[str, Union[torch.Tensor, Any]]: + # Prepares inputs for model training/evaluation by managing completion generation and batch handling. + # During training: + # - Receives the local generation batch (Per-GPU batch size × steps per generation) + # from the modified training dataloader instead of the standard local batch + # - Generates completions once for the entire generation batch and splits it into batches of size + # `per_device_train_batch_size` + # - Buffers these completions and returns the appropriate slice for the current accumulation step + # - Optimizes by regenerating completions only periodically (every steps_per_generation * num_iterations) + # During evaluation: + # - The input is treated as a standard local batch (no accumulation, no multiple iterations) + # - Completions are generated for each batch without buffering or reuse + # Returns a single local batch in both cases. 
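+        # Example cadence with assumed values: steps_per_generation = 4 and num_iterations = 2 give
+        # generate_every = 8, so completions produced at _step 0 are split into 4 slices and each
+        # slice is consumed twice (once per iteration) before fresh completions are generated at _step 8.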
+ if hasattr(self, 'llm'): + if getattr(self.llm.llm_engine.vllm_config.model_config, 'enable_sleep_mode', False): + self.llm.wake_up() + + mode = "train" if self.model.training else "eval" + if mode == "train": + generate_every = self.args.steps_per_generation * self.num_iterations + if self._step % generate_every == 0 or self._buffered_inputs is None: + # self._buffered_inputs=None can occur when resuming from a checkpoint + generation_batch = self._generate_and_score_completions(generation_batch) + generation_batch = shuffle_tensor_dict(generation_batch) + self._buffered_inputs = split_tensor_dict(generation_batch, self.args.steps_per_generation) + inputs = self._buffered_inputs[self._step % self.args.steps_per_generation] + self._step += 1 + else: + # In evaluation, there is neither batch grouping for generation, nor multiple iterations, hence + # local generation batch == local eval batch + inputs = self._generate_and_score_completions(generation_batch) + if hasattr(self, 'llm'): + if getattr(self.llm.llm_engine.vllm_config.model_config, 'enable_sleep_mode', False): + self.llm.sleep(os.environ.get('VLLM_SLEEP_MODE', 1)) + return inputs + + @profiling_decorator + def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list): + device = self.accelerator.device + rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) + + # Repeat all input columns (but "prompt", "completion", and "completion_ids") to match the num of generations + keys = [key for key in inputs[0] if key not in ["prompt", "completion", "completion_ids"]] + reward_kwargs = {key: [example[key] for example in inputs] for key in keys} + + for i, (reward_func, reward_processing_class, reward_func_name) in enumerate( + zip(self.reward_funcs, self.reward_processing_classes, self.reward_func_names) + ): + with profiling_context(self, reward_func_name): + if isinstance(reward_func, nn.Module): # Module (no PretrainedModel) for compat with compiled models + if is_conversational(inputs[0]): + messages = [{"messages": p + c} for p, c in zip(prompts, completions)] + texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] + else: + texts = [p + c for p, c in zip(prompts, completions)] + reward_inputs = reward_processing_class( + text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False + ) + reward_inputs = super()._prepare_inputs(reward_inputs) + with torch.inference_mode(): + rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,) + else: + output_reward_func = reward_func( + prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs + ) + # Convert None values to NaN + output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func] + + rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) + + # If all reward functions return None for a given row, issue a detailed warning + if torch.isnan(rewards_per_func).all(dim=1).any(): + nan_row_idx = torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0] + row_reward_kwargs = {key: value[nan_row_idx] for key, value in reward_kwargs.items()} + row_reward_kwargs["prompt"] = prompts[nan_row_idx] + row_reward_kwargs["completion"] = completions[nan_row_idx] + warnings.warn( + f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. " + "Please ensure that at least one reward function returns a valid reward." 
+ ) + + # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the + # completions may be distributed across processes + rewards_per_func = gather(rewards_per_func) + return rewards_per_func + + def _generate_and_score_completions( + self, inputs: list[dict[str, Union[torch.Tensor, Any]]] + ) -> dict[str, Union[torch.Tensor, Any]]: + device = self.accelerator.device + mode = "train" if self.model.training else "eval" + + prompts = [x["prompt"] for x in inputs] + prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs] + prompt_inputs = self.processing_class( + text=prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False + ) + prompt_inputs = super()._prepare_inputs(prompt_inputs) + prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"] + + if self.max_prompt_length is not None: + # If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens. + # Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text, + # because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation). + prompt_ids = prompt_ids[:, -self.max_prompt_length :] + prompt_mask = prompt_mask[:, -self.max_prompt_length :] + prompts_text = self.processing_class.batch_decode( + prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False + ) + prompts_text = [ + re.sub(rf"^({re.escape(self.processing_class.pad_token)})+", "", text) for text in prompts_text + ] + + # Generate completions using either vLLM or regular generation + if self.use_vllm: + # First, update the vLLM weights if needed + if self.state.global_step != self._last_loaded_step: + self._move_model_to_vllm() + self._last_loaded_step = self.state.global_step + + # Generate completions using vLLM: gather all prompts and use them in a single call in the main process + if self.vllm_mode == "server": + all_prompts_text = gather_object(prompts_text) + if self.accelerator.is_main_process: + # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate + # num_generations outputs for each one. This is faster than generating outputs for each duplicate + # prompt individually. + ordered_set_of_prompts = all_prompts_text[:: self.num_generations] + with profiling_context(self, "vLLM.generate"): + completion_ids = self.vllm_client.generate( + prompts=ordered_set_of_prompts, + n=self.num_generations, + repetition_penalty=self.repetition_penalty, + temperature=self.temperature, + top_p=self.top_p, + top_k=-1 if self.top_k is None else self.top_k, + min_p=0.0 if self.min_p is None else self.min_p, + max_tokens=self.max_completion_length, + guided_decoding_regex=self.guided_decoding_regex, + generation_kwargs=self.args.generation_kwargs, + ) + else: + completion_ids = [None] * len(all_prompts_text) + # Broadcast the completions from the main process to all processes, ensuring each process receives its + # corresponding slice. 
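+                # Slice arithmetic, with assumed numbers: 2 processes and len(prompts) == 8 per process
+                # means the main process broadcasts 16 completions; process 0 keeps indices 0:8 and
+                # process 1 keeps indices 8:16.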
+ completion_ids = broadcast_object_list(completion_ids, from_process=0) + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + completion_ids = completion_ids[process_slice] + + # Generate completions using colocated vLLM instances: each device holds vLLM copy and work on their own batch of prompts + elif self.vllm_mode == "colocate": + if self.guided_decoding_regex: + guided_decoding = GuidedDecodingParams(backend="outlines", regex=self.guided_decoding_regex) + else: + guided_decoding = None + + generation_kwargs = { + "n": 1, # vLLM on each GPU generates only 1 in colocate mode + "repetition_penalty": self.repetition_penalty, + "temperature": self.temperature, + "top_p": self.top_p, + "top_k": -1 if self.top_k is None else self.top_k, + "min_p": 0.0 if self.min_p is None else self.min_p, + "max_tokens": self.max_completion_length, + "guided_decoding": guided_decoding, + } + if self.args.generation_kwargs is not None: + generation_kwargs.update(self.args.generation_kwargs) + sampling_params = SamplingParams(**generation_kwargs) + + if self.vllm_tensor_parallel_size > 1: + # Gather prompts from all ranks in the TP group and flatten. + # Each rank starts with its own prompts; after gathering, all ranks see the full group set. + orig_size = len(prompts_text) + gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)] + torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group) + all_prompts_text = [p for sublist in gathered_prompts for p in sublist] + else: + all_prompts_text = prompts_text + + with profiling_context(self, "vLLM.generate"): + all_outputs = self.llm.generate(all_prompts_text, sampling_params=sampling_params, use_tqdm=False, lora_request = self.model.load_lora('grpo_trainer_lora_model', load_tensors = True)) + + completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs] + + if self.vllm_tensor_parallel_size > 1: + # Slice completions for this rank within its TP group. + # Each rank generates all outputs — we keep only our share. 
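+                    # For instance, with assumed sizes vllm_tensor_parallel_size == 2 and orig_size == 4,
+                    # every rank in the TP group sees 8 completions; local rank 0 keeps 0:4 and
+                    # local rank 1 keeps 4:8.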
+ local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) + tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) + completion_ids = completion_ids[tp_slice] + + # Pad the completions, and concatenate them with the prompts + completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids] + completion_ids = pad(completion_ids, padding_value=self.processing_class.pad_token_id) + prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) + else: + # Regular generation path + with unwrap_model_for_generation( + self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + with ( + FSDP.summon_full_params(self.model_wrapped, recurse=False) + if self.is_fsdp_enabled + else nullcontext() + ): + prompt_completion_ids = unwrapped_model.generate( + prompt_ids, attention_mask=prompt_mask, generation_config=self.generation_config + ) + + # Compute prompt length and extract completion ids + prompt_length = prompt_ids.size(1) + prompt_ids = prompt_completion_ids[:, :prompt_length] + completion_ids = prompt_completion_ids[:, prompt_length:] + + # Mask everything after the first EOS token + is_eos = completion_ids == self.processing_class.eos_token_id + eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device) + eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] + sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1) + completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() + + # Convert tensor to a list of lists of token IDs. This will be passed to the reward function, avoiding the need + # to re-tokenize completions if the reward is computed from tokens. + completion_ids_list = [ + [id.item() for id, m in zip(row, mask_row) if m] for row, mask_row in zip(completion_ids, completion_mask) + ] + + # Sum along sequence dimension (dim=1) to get completion length per sequence, used for logging + completion_lengths = completion_mask.sum(1) + + # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask + if self.mask_truncated_completions: + truncated_completions = ~is_eos.any(dim=1) + completion_mask = completion_mask * (~truncated_completions).unsqueeze(1).int() + + # Concatenate prompt_mask with completion_mask for logit computation + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C) + + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size + + with torch.no_grad(): + # When using num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps + # old_per_token_logps == per_token_logps, so we can skip it's computation here, and use + # per_token_logps.detach() instead. 
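+            # Example with assumed settings: with num_iterations = 2 the buffered completions are
+            # optimized against a second time after the weights have already been updated, so the
+            # sampling-time log-probabilities must be cached here to form the importance ratio;
+            # with a single pass the ratio is identically 1 and caching them would be redundant.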
+ if self.num_iterations > 1 or self.args.steps_per_generation > self.args.gradient_accumulation_steps: + old_per_token_logps = self._get_per_token_logps( + self.model, prompt_completion_ids, attention_mask, logits_to_keep, batch_size + ) + else: + old_per_token_logps = None + + # Compute the per-token log probabilities for the reference model + if self.beta != 0.0: + if self.ref_model is not None: + ref_per_token_logps = self._get_per_token_logps( + self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep + ) + else: + with self.accelerator.unwrap_model(self.model).disable_adapter(): + ref_per_token_logps = self._get_per_token_logps( + self.model, prompt_completion_ids, attention_mask, logits_to_keep + ) + else: + ref_per_token_logps = None + + # Decode the generated completions + completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational(inputs[0]): + completions = [] + for prompt, completion in zip(prompts, completions_text): + bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" + completions.append([{"role": "assistant", "content": bootstrap + completion}]) + else: + completions = completions_text + + # Calculate rewards for each reward function. rewards_per_func aggregates rewards across all processes. This is + # important because rewards will be normalized per group, and completions are distributed. We will later slice + # rewards_per_func to extract each process's subset. + rewards_per_func = self._calculate_rewards(inputs, prompts, completions, completion_ids_list) + + # Apply weights to each reward function's output and sum + rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1) + + # Compute grouped-wise rewards + mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1) + std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1) + is_std_zero = torch.isclose(std_grouped_rewards, torch.zeros_like(std_grouped_rewards)) + + # Normalize the rewards to compute the advantages + mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0) + advantages = rewards - mean_grouped_rewards + if self.scale_rewards: + advantages = advantages / (std_grouped_rewards + 1e-4) + + # Slice to keep only the local part of the data + process_slice = slice( + self.accelerator.process_index * len(prompts), + (self.accelerator.process_index + 1) * len(prompts), + ) + all_process_advantages = advantages.clone() # keep the aggregated advantages for logging + advantages = advantages[process_slice] + + # Log the metrics + if mode == "train": + self.state.num_input_tokens_seen += self.accelerator.gather(attention_mask.sum()).sum().item() + self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen] + + # Log completion lengths, mean, min, max + agg_completion_lengths = self.accelerator.gather(completion_lengths) + self._metrics[mode]["completions/mean_length"].append(agg_completion_lengths.float().mean().item()) + self._metrics[mode]["completions/min_length"].append(agg_completion_lengths.float().min().item()) + self._metrics[mode]["completions/max_length"].append(agg_completion_lengths.float().max().item()) + + # Identify sequences that terminated with EOS and log their lengths + agg_terminated_with_eos = self.accelerator.gather(is_eos.any(dim=1)) + term_completion_lengths = 
agg_completion_lengths[agg_terminated_with_eos] + clipped_completions_ratio = 1 - len(term_completion_lengths) / len(agg_completion_lengths) + self._metrics[mode]["completions/clipped_ratio"].append(clipped_completions_ratio) + if len(term_completion_lengths) == 0: # edge case where no terminated sequences are found + term_completion_lengths = torch.zeros(1, device=device) + self._metrics[mode]["completions/mean_terminated_length"].append(term_completion_lengths.float().mean().item()) + self._metrics[mode]["completions/min_terminated_length"].append(term_completion_lengths.float().min().item()) + self._metrics[mode]["completions/max_terminated_length"].append(term_completion_lengths.float().max().item()) + + # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values) + for i, reward_func_name in enumerate(self.reward_func_names): + mean_rewards = torch.nanmean(rewards_per_func[:, i]).item() + self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards) + std_rewards = nanstd(rewards_per_func[:, i]).item() + self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_rewards) + self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item()) + self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item()) + self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item()) + + # Log prompt and completion texts + self._textual_logs["prompt"].extend(gather_object(prompts_text)) + self._textual_logs["completion"].extend(gather_object(completions_text)) + for i, name in enumerate(self.reward_func_names): + self._textual_logs["rewards"][name].extend(rewards_per_func[:, i].tolist()) + self._textual_logs["advantages"].extend(all_process_advantages.tolist()) + + return { + "prompt_ids": prompt_ids, + "prompt_mask": prompt_mask, + "completion_ids": completion_ids, + "completion_mask": completion_mask, + "advantages": advantages, + "old_per_token_logps": old_per_token_logps, + "ref_per_token_logps": ref_per_token_logps, + } + + def compute_liger_loss(self, unwrapped_model, inputs): + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + # get the last hidden state of the model + last_hidden_state = self._get_last_hidden_state(unwrapped_model, input_ids, attention_mask, logits_to_keep) + + # compute loss and metrics using liger grpo loss + loss, metrics = self.liger_grpo_loss( + _input=last_hidden_state, + lin_weight=unwrapped_model.lm_head.weight, + selected_token_ids=completion_ids, + attention_mask=completion_mask, + advantages=inputs["advantages"], + bias=unwrapped_model.lm_head.bias, + old_per_token_logps=inputs["old_per_token_logps"], + ref_per_token_logps=inputs["ref_per_token_logps"], + ) + # Extract metrics from the liger_grpo_loss output + # KL divergence is the first metric when beta is non-zero + mean_kl = metrics[0] if self.beta != 0.0 else None + clip_ratio = metrics[-1] + + mode = "train" if self.model.training else "eval" + if self.beta != 0.0: + self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).mean().item()) + 
self._metrics[mode]["clip_ratio"].append(self.accelerator.gather(clip_ratio).mean().item()) + return loss + + def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None): + if return_outputs: + raise ValueError("The GRPOTrainer does not support returning outputs") + # Compute the per-token log probabilities for the model + + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + bsz, qlen = input_ids.shape + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + # attention_mask = None + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + _input_ids = input_ids + _logits_to_keep = logits_to_keep + + get_logps_func = \ + lambda model, input_ids, attention_mask, logits_to_keep, batch_size=None, compute_entropy=False: \ + self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) \ + if hasattr(self, "_get_per_token_logps") else \ + self._get_per_token_logps_and_entropies(model, input_ids, attention_mask, logits_to_keep, batch_size, compute_entropy)['logps'] + + per_token_logps = get_logps_func(model, input_ids, attention_mask, logits_to_keep) + + # Compute the KL divergence between the model and the reference model + # _prepare_inputs doesn't return reference log probs anymore. We need to calculate it ourselves. + # https://github.com/huggingface/trl/blob/05bc43e960396581e458195b8388efe6b82cae1f/trl/trainer/grpo_trainer.py#L1328 + if self.beta != 0.0: + with torch.inference_mode(), model.disable_adapter(): + ref_per_token_logps = per_token_logps = get_logps_func(model, input_ids, attention_mask, logits_to_keep) + else: + ref_per_token_logps = None + # per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + # x - x.detach() allows for preserving gradients from x + advantages = inputs["advantages"] + # per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) + # per_token_loss = -(per_token_loss - self.beta * per_token_kl) + # loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + old_hidden_states = inputs.get("old_per_token_logps", None) + input_ids = input_ids[:, -logits_to_keep:] + + # Get logit softcapping and logit scale + logit_softcapping = getattr(model.config, "final_logit_softcapping", 0) # Gemma + if logit_softcapping is None: logit_softcapping = 0 + logit_scale_multiply = getattr(model.config, "logit_scale", 0) # Cohere + if logit_scale_multiply is None: logit_scale_multiply = 0 + logit_scale_divide = getattr(model.config, "logits_scaling", 0) # Granite + if logit_scale_divide is None: logit_scale_divide = 0 + if per_token_logps is not None: + + if ref_per_token_logps is not None: + ref_per_token_logps = ref_per_token_logps[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + per_token_logps = per_token_logps[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + + loss, completion_length, mean_kl = grpo_compute_loss_slow( + ref_per_token_logps, + per_token_logps, + old_hidden_states, + input_ids, + completion_mask, + self.beta, + advantages, + loss_type = self.args.loss_type, + epsilon_low = self.epsilon_low, + epsilon_high = self.epsilon_high, + max_completion_length = self.args.max_completion_length, + 
delta = self.args.delta, + temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + ) + else: + if hasattr(self.args, "loss_type"): + loss, completion_length, mean_kl = grpo_accumulated_loss( + trainer = self, + input_ids = _input_ids, + logits_to_keep = logits_to_keep, + completion_mask = completion_mask, + advantages = advantages, + old_hidden_states = old_hidden_states, + n_chunks = self.args.unsloth_num_chunks, + loss_type = self.args.loss_type, + epsilon_low = self.epsilon_low, + epsilon_high = self.epsilon_high, + max_completion_length = self.args.max_completion_length, + delta = self.args.delta, + temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + attention_mask = attention_mask, + ) + else: + # to ensure backwards compatibility with trl 0.15.2 and maybe even 0.17 + loss, completion_length, mean_kl = grpo_accumulated_loss( + trainer = self, + input_ids = _input_ids, + logits_to_keep = logits_to_keep, + completion_mask = completion_mask, + advantages = advantages, + old_hidden_states = old_hidden_states, + n_chunks = self.args.unsloth_num_chunks, + temperature = self.args.temperature, + logit_softcapping = logit_softcapping, + logit_scale_multiply = logit_scale_multiply, + logit_scale_divide = logit_scale_divide, + attention_mask = attention_mask, + ) + pass + pass + # Log the metrics + # completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() + # mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + # self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + if "train" in self._metrics: + mode = "eval" if self.control.should_evaluate else "train" + self._metrics[mode]["completion_length"].append(completion_length.item()) + self._metrics[mode]["kl"].append(mean_kl.item()) + else: + self._metrics["completion_length"].append(completion_length.item()) + self._metrics["kl"].append(mean_kl.item()) + return loss + + def _compute_loss(self, model, inputs): + # Compute the per-token log probabilities for the model + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) + + # Compute the KL divergence between the model and the reference model + if self.beta != 0.0: + ref_per_token_logps = inputs["ref_per_token_logps"] + per_token_kl = ( + torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + ) + + # Compute the loss + advantages = inputs["advantages"] + # When using num_iterations == 1 and steps_per_generation <= gradient_accumulation_steps + # old_per_token_logps == per_token_logps, so we can skip it's computation + # (see _generate_and_score_completions) and use per_token_logps.detach() instead. 
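+ # The block below implements the clipped surrogate objective: with ratio r = exp(logp - old_logp) and
+ # advantage A, the per-token loss is -min(r * A, clip(r, 1 - epsilon_low, 1 + epsilon_high) * A),
+ # optionally capping r at self.args.delta and adding self.beta * per_token_kl when beta != 0.
+ # For example, with epsilon_high = 0.2 and A > 0, any ratio above 1.2 contributes no extra gradient.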
+ old_per_token_logps = ( + per_token_logps.detach() if inputs["old_per_token_logps"] is None else inputs["old_per_token_logps"] + ) + coef_1 = torch.exp(per_token_logps - old_per_token_logps) + coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) + + # Two-sided clipping + if self.args.delta is not None: + coef_1 = torch.clamp(coef_1, max=self.args.delta) + + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + if self.beta != 0.0: + per_token_loss = per_token_loss + self.beta * per_token_kl + + if self.loss_type == "grpo": + loss = ((per_token_loss * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)).mean() + elif self.loss_type == "bnpo": + loss = (per_token_loss * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) + elif self.loss_type == "dr_grpo": + loss = (per_token_loss * completion_mask).sum() / (per_token_loss.size(0) * self.max_completion_length) + else: + raise ValueError(f"Unknown loss type: {self.loss_type}") + + # Log the metrics + mode = "train" if self.model.training else "eval" + + if self.beta != 0.0: + mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum() + self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) + + # Compute the clipped probability ratios + is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages.unsqueeze(1) < 0) + is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages.unsqueeze(1) > 0) + is_region_clipped = is_low_clipped | is_high_clipped + + low_clip = (is_low_clipped * completion_mask).sum() / completion_mask.sum() + high_clip = (is_high_clipped * completion_mask).sum() / completion_mask.sum() + clip_ratio = (is_region_clipped * completion_mask).sum() / completion_mask.sum() + + gathered_low_clip = self.accelerator.gather(low_clip) + self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item()) + gathered_high_clip = self.accelerator.gather(high_clip) + self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item()) + self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item()) + gathered_clip_ratio = self.accelerator.gather(clip_ratio) + self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item()) + return loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None): + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + loss = loss.mean().detach() + return loss, None, None + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + mode = "train" if self.model.training else "eval" + metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. 
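+ # For example, "completions/mean_length" is reported as "eval_completions/mean_length" during evaluation.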
+ if mode == "eval": + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + super().log(logs, start_time) + self._metrics[mode].clear() + + if self.accelerator.is_main_process and self.log_completions: + if is_rich_available(): + print_prompt_completions_sample( + self._textual_logs["prompt"], + self._textual_logs["completion"], + self._textual_logs["rewards"], + self._textual_logs["advantages"], + self.state.global_step, + self.num_completions_to_print, + ) + + if self.args.report_to and "wandb" in self.args.report_to and wandb.run is not None: + import pandas as pd + + table = { + "step": [str(self.state.global_step)] * len(self._textual_logs["prompt"]), + "prompt": self._textual_logs["prompt"], + "completion": self._textual_logs["completion"], + **self._textual_logs["rewards"], + "advantage": self._textual_logs["advantages"], + } + df = pd.DataFrame(table) + if self.wandb_log_unique_prompts: + df = df.drop_duplicates(subset=["prompt"]) + wandb.log({"completions": wandb.Table(dataframe=df)}) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent( + """\ + @article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, + } + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GRPO", + trainer_citation=citation, + paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + paper_id="2402.03300", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothGRPOTrainer(_UnslothGRPOTrainer): + """ + + Trainer for the Group Relative Policy Optimization (GRPO) method. 
This algorithm was initially proposed in the + paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language + Models](https://huggingface.co/papers/2402.03300). + + Example: + + ```python + from datasets import load_dataset + from trl import GRPOTrainer + + dataset = load_dataset("trl-lib/tldr", split="train") + def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] + trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, + ) + + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): + Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward + functions with the prompts and completions and sum the rewards. Can be either: + + - A single reward function, such as: + - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the + keyword arguments in `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported. + - A custom reward function: The function is provided with the prompts and the generated completions, + plus any additional columns in the dataset. It should return a list of rewards. Custom reward + functions can also return None when the reward is not applicable to those samples. This is useful for + multi-task training where different reward functions apply to different types of samples. When a + reward function returns None for a sample, that reward function is excluded from the reward + calculation for that sample. For more details, see [Using a custom reward + function](#using-a-custom-reward-function). + - A list of reward functions, where each item can independently be any of the above types. Mixing different + types within the list (e.g., a string model ID and a custom reward function) is allowed. + args ([`GRPOConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is + ignored. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. + - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). 
+ eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. + processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Processing class used to process the data. The padding side must be set to "left". If `None`, the + processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. A + padding token, `processing_class.pad_token`, must be set. If the processing class has not set a padding + token, `processing_class.eos_token` will be used as the default. + reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`): + Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either: + + - A single processing class: Used when `reward_funcs` contains only one reward function. + - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`. + If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is + `None`, the tokenizer for the model is automatically loaded using + [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward + functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes` + are ignored. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed + in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + + """ + def __init__( + self, + model, + reward_funcs, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + reward_processing_classes = None, + callbacks = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothGRPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. 
Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + other_metrics = [] + if not isinstance(reward_funcs, list): _reward_funcs = [reward_funcs] + 
else: _reward_funcs = reward_funcs + for reward_func in _reward_funcs: + try: + reward_func_name = reward_func.__name__ + if True: + other_metrics.append(f'rewards/{reward_func_name}/mean') + if True: + other_metrics.append(f'rewards/{reward_func_name}/std') + if False: + other_metrics.append(f'rewards/{reward_func_name}') + except: pass + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('grpo_trainer', other_metrics) + + super().__init__( + model = model, + reward_funcs = reward_funcs, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + reward_processing_classes = reward_processing_classes, + callbacks = callbacks, + peft_config = peft_config,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py b/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f1682e202d77ad61da4560323fc2476aa4d91a8f --- /dev/null +++ b/unsloth_compiled_cache/UnslothIterativeSFTTrainer.py @@ -0,0 +1,947 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.iterative_sft_trainer import (AutoModelForCausalLM, AutoTokenizer, BaseImageProcessor, Callable, DataCollator, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, DataLoader, Dataset, EvalLoopOutput, FeatureExtractionMixin, IterativeSFTConfig, IterativeSFTTrainer, Optional, PPODecorators, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainingArguments, Union, generate_model_card, get_comet_experiment_url, is_peft_available, is_wandb_available, os, torch, wandb, warnings, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothIterativeSFTConfig(IterativeSFTConfig): + """ + + Configuration class for the [`IterativeSFTTrainer`]. + + This class includes only the parameters that are specific to Iterative SFT training. 
For a full list of training + arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this + class may differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`IterativeSFTTrainer`] is provided as a string. + + > Parameters that control the data preprocessing + + max_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + The truncation mode to use, either `"keep_end"` or `"keep_start"`. + optimize_device_cache (`bool`, *optional*, defaults to `False`): + Whether to optimize accelerator cache for slightly more memory-efficient training. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + 
label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + model_init_kwargs = None, + max_length = None, + truncation_mode = 'keep_end', + optimize_device_cache = False, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + 
dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + max_length = max_length, + truncation_mode = truncation_mode, + optimize_device_cache = optimize_device_cache,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothIterativeSFTTrainer(Trainer): + """""" + + _tag_names = ["trl", "iterative-sft"] + + def __init__( + self, + model: Union[str, PreTrainedModel], + args: Optional[Union[IterativeSFTConfig, TrainingArguments]] = None, + data_collator: Optional[DataCollator] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + # Deprecated parameters + max_length: Optional[int] = None, + truncation_mode: Optional[str] = None, + optimize_device_cache: Optional[bool] = None, + ): + # Handle deprecated parameters + deprecated_params = {} + if max_length is not None: + deprecated_params["max_length"] = max_length + warnings.warn( + "The `max_length` parameter is deprecated and will be removed in version 0.20. " + "Pass it through the `args` parameter using `IterativeSFTConfig(max_length=...)` instead.", + DeprecationWarning, + ) + if truncation_mode is not None: + deprecated_params["truncation_mode"] = truncation_mode + warnings.warn( + "The `truncation_mode` parameter is deprecated and will be removed in version 0.20. 
" + "Pass it through the `args` parameter using `IterativeSFTConfig(truncation_mode=...)` instead.", + DeprecationWarning, + ) + if optimize_device_cache is not None: + deprecated_params["optimize_device_cache"] = optimize_device_cache + warnings.warn( + "The `optimize_device_cache` parameter is deprecated and will be removed in version 0.20 " + "Pass it through the `args` parameter using `IterativeSFTConfig(optimize_device_cache=...)` instead.", + DeprecationWarning, + ) + + # Args + model_id = model if isinstance(model, str) else model.config._name_or_path + if args is None: + model_name = model_id.split("/")[-1] + args = IterativeSFTConfig(f"{model_name}-IterativeSFT") + elif isinstance(args, TrainingArguments) and not isinstance(args, IterativeSFTConfig): + dict_args = args.to_dict() + dict_args["hub_token"] = args.hub_token # to_dict hides the hub_token + dict_args.pop("push_to_hub_token") + args = IterativeSFTConfig(**dict_args) + + # Update args with deprecated parameters if provided + if deprecated_params: + for key, value in deprecated_params.items(): + setattr(args, key, value) + + # Handle the tokenizer + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model_id) + + # Model + if args.model_init_kwargs is not None and not isinstance(model, str): + warnings.warn( + "You passed model_init_kwargs to the `IterativeSFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) + if isinstance(model, str): + model = self._create_model_from_path(model, args) + + # PEFT configuration and model wrapping + if is_peft_available() and isinstance(model, PeftModel): + self.is_peft_model = True + else: + self.is_peft_model = False + + self.processing_class = processing_class + self.is_encoder_decoder = getattr(model.config, "is_encoder_decoder", False) + + if data_collator is None: + if self.is_encoder_decoder: + self.data_collator = DataCollatorForSeq2Seq( + processing_class, label_pad_token_id=-100, pad_to_multiple_of=8 + ) + else: + self.data_collator = DataCollatorForLanguageModeling(self.processing_class, mlm=False) + else: + self.data_collator = data_collator + + self.max_length = args.max_length + self.truncation_mode = args.truncation_mode + self.optimize_device_cache = args.optimize_device_cache + + super().__init__( + model=model, + args=args, + data_collator=self.data_collator, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + self.create_optimizer_and_scheduler(self.args.max_steps) + + # prepare model, optimizer and lr_scheduler + self.model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + self.processing_class.truncation_side = "left" if self.truncation_mode == "keep_end" else "right" + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." 
+ ) + + PPODecorators.optimize_device_cache = self.optimize_device_cache + + def _create_model_from_path(self, model_path: str, args: IterativeSFTConfig) -> PreTrainedModel: + """Creates a model from a path or model identifier.""" + model_init_kwargs = args.model_init_kwargs or {} + return AutoModelForCausalLM.from_pretrained(model_path, **model_init_kwargs) + + def prepare_model_inputs(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: torch.Tensor): + if attention_mask is None: + attention_mask = [torch.ones_like(ids) for ids in input_ids] + + if self.is_encoder_decoder: + input_data = self.data_collator( + [ + {"input_ids": ids, "attention_mask": att, "labels": lab} + for ids, att, lab in zip(input_ids, attention_mask, labels) + ] + ).to(self.model.device) + + input_data.pop("decoder_input_ids", None) # This is directly computed inside the model + + input_data["labels"][input_data["labels"] == self.processing_class.pad_token_id] = -100 + + else: + input_data = self.data_collator( + [{"input_ids": ids, "attention_mask": att} for ids, att in zip(input_ids, attention_mask)] + ).to(self.model.device) + + # truncate in case the user has provided input_ids, attention_mask and labels + if self.max_length is not None: + if self.truncation_mode == "keep_start": + input_data = {k: v[: self.max_length] for k, v in input_data.items()} + elif self.truncation_mode == "keep_end": + input_data = {k: v[-self.max_length :] for k, v in input_data.items()} + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + return input_data + + @staticmethod + def _step_safety_checker( + input_ids: list[torch.LongTensor], + attention_mask: list[torch.LongTensor], + labels: list[torch.LongTensor], + texts: list[str], + texts_labels: list[str], + ): + """ + Check if the input data is valid for training. + + Args: + input_ids (list[`torch.LongTensor`]): + List of tensors containing the input_ids + attention_mask (list[`torch.LongTensor`]): + List of tensors containing the attention_mask + labels (list[`torch.FloatTensor`]): + List of tensors containing the labels + texts (list[`str`]): + List of string containing the text input. + texts_labels (list[`str`]): + List of string containing the text labels. + + Returns: + `tuple`: The input data. 
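+
+ Example (illustrative only; `trainer` stands for any instance of this trainer class):
+
+ ```python
+ ids = [torch.tensor([1, 2, 3])]
+ checked = trainer._step_safety_checker(ids, None, ids, None, None)
+ # the five arguments are returned unchanged once the type checks pass
+ ```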
+ """ + if texts is None: + if attention_mask is None: + for name, tensor_list in zip(["input_ids", "labels"], [input_ids, labels]): + if not isinstance(tensor_list, list): + raise ValueError(f"{name} must be a list of tensors - got {type(tensor_list)}") + if not isinstance(tensor_list[0], torch.Tensor): + raise ValueError(f"Elements in {name} must be tensors - got {type(tensor_list[0])}") + else: + for name, tensor_list in zip( + ["input_ids", "attention_mask", "labels"], [input_ids, attention_mask, labels] + ): + if not isinstance(tensor_list, list): + raise ValueError(f"{name} must be a list of tensors - got {type(tensor_list)}") + if not isinstance(tensor_list[0], torch.Tensor): + raise ValueError(f"Elements in {name} must be tensors - got {type(tensor_list[0])}") + else: + if not isinstance(texts, list): + raise ValueError(f"'text' must be a list of strings - got {type(texts)}") + if not isinstance(texts[0], str): + raise ValueError(f"Elements in 'text' must be strings - got {type(texts[0])}") + if texts_labels is not None: + if not isinstance(texts_labels, list): + raise ValueError(f"'text_labels' must be a list of strings - got {type(texts_labels)}") + if not isinstance(texts_labels[0], str): + raise ValueError(f"Elements in 'text_labels' must be strings - got {type(texts_labels[0])}") + + return input_ids, attention_mask, labels, texts, texts_labels + + @PPODecorators.empty_device_cache() + def step( + self, + input_ids: Optional[list[torch.LongTensor]] = None, + attention_mask: Optional[list[torch.LongTensor]] = None, + labels: Optional[list[torch.LongTensor]] = None, + texts: Optional[list[str]] = None, + texts_labels: Optional[list[str]] = None, + ): + """ + Run an optimisation step given a list of input_ids, attention_mask, and labels or a list of text and + text_labels. + + Args: + input_ids (list[`torch.LongTensor`]): + List of tensors containing the input_ids (if not provided, text will be used) + attention_mask (list[`torch.LongTensor`], , *optional*): + List of tensors containing the attention_mask + labels (list[`torch.FloatTensor`], *optional*): + List of tensors containing the labels (if set to None, will default to input_ids) + texts (list[`str`], *optional*): + List of strings containing the text input (if not provided, input_ids will directly be used) + texts_labels (list[`str`], *optional*): + List of strings containing the text labels (if set to None, will default to text) + + Returns: + `dict[str, Any]`: A summary of the training statistics + """ + self.model.train() + + if self.state.global_step == 0: + self.tr_loss = torch.tensor(0.0).to(self.args.device) + self._globalstep_last_logged = self.state.global_step + + if input_ids is None and texts is None: + raise ValueError("Step should include `input_ids` or `texts` as keyword arguments.") + elif input_ids is not None and texts is not None: + warnings.warn( + "Both `input_ids` and `texts` argument are provided. `input_ids` will be ignored. " + "Please provide only one of the two.", + UserWarning, + ) + + if labels is None and texts_labels is None and self.is_encoder_decoder: + raise ValueError( + "No 'labels' or 'text_labels' are provided. When using an encoder-decoder architecture, 'labels' or 'text_labels' must be passed." 
+ ) + + # Convert Column to list if not already + input_ids = input_ids[:] if input_ids is not None else None + attention_mask = attention_mask[:] if attention_mask is not None else None + labels = labels[:] if labels is not None else None + texts = texts[:] if texts is not None else None + texts_labels = texts_labels[:] if texts_labels is not None else None + + input_ids, attention_mask, labels, texts, texts_labels = self._step_safety_checker( + input_ids, attention_mask, labels, texts, texts_labels + ) + + if texts is not None: + model_inputs = self.processing_class( + texts, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt" + ) + + input_ids, attention_mask = model_inputs["input_ids"], model_inputs["attention_mask"] + + if texts_labels is not None: + labels = self.processing_class( + texts, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt" + )["input_ids"] + + if labels is None: + labels = input_ids + + model_inputs = self.prepare_model_inputs(input_ids, attention_mask, labels) + + model_inputs_names = list(model_inputs.keys()) + + batch_dict = {} + batch_dict.update(model_inputs) + + def collator(data): + return_dict = dict() + for key in data[0]: + if key in ["input_ids", "attention_mask", "labels"]: + return_dict[key] = torch.stack([d[key] for d in data]).to(self.model.device) + return return_dict + + batch_data = Dataset.from_dict(batch_dict) + batch_data.set_format("torch") + + step_dataloader = DataLoader( + batch_data, + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + collate_fn=collator, + ) + + for _, batch in enumerate(step_dataloader): + with self.accelerator.accumulate(self.model): + model_inputs = {k: batch[k] for k in model_inputs_names} + loss = self.compute_loss(self.model, model_inputs) + + if self.args.n_gpu > 1: + loss = loss.mean() + + tr_loss_step = loss.detach() + + self.accelerator.backward(loss) + + if self.accelerator.sync_gradients and self.args.max_grad_norm is not None: + self.accelerator.clip_grad_norm_( + self.model.parameters(), + self.args.max_grad_norm, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + if self.lr_scheduler is not None: + self.lr_scheduler.step() + + self.state.global_step += 1 + + # update stats etc + self.tr_loss += tr_loss_step + + self._maybe_log_save_evaluate() + + def _maybe_log_save_evaluate(self): + # check if eval is required + if self.args.eval_steps is not None: + if self.state.global_step % self.args.eval_steps == 0 and self.state.global_step != 0: + self.evaluate(self.eval_dataset) + + # check if logging is required + if self.args.logging_steps is not None: + if self.state.global_step % self.args.logging_steps == 0 and self.state.global_step != 0: + logs: dict[str, float] = {} + + tr_loss_scalar = self._nested_gather(self.tr_loss).mean().item() + + # reset tr_loss to zero + self.tr_loss -= self.tr_loss + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + logs["learning_rate"] = self._get_learning_rate() + + self._globalstep_last_logged = self.state.global_step + + self.log(logs) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + 
dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Iterative SFT", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothIterativeSFTTrainer(_UnslothIterativeSFTTrainer): + """ + + The IterativeSFTTrainer can be used to finetune models with methods that requires some steps between optimization. + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + args ([`IterativeSFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to [`~transformers.default_data_collator`] if no `processing_class` is provided, an instance + of [`~transformers.DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or + tokenizer. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoTokenizer.from_pretrained`]. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. 
+ max_length (`int`, *optional*, deprecated): + Maximum length of the tokenized sequence. Use `args.max_length` instead. + truncation_mode (`str`, *optional*, deprecated): + The truncation mode to use. Use `args.truncation_mode` instead. + optimize_device_cache (`bool`, *optional*, deprecated): + Whether to optimize accelerator cache. Use `args.optimize_device_cache` instead. + + """ + def __init__( + self, + model, + args = None, + data_collator = None, + eval_dataset = None, + processing_class = None, + preprocess_logits_for_metrics = None, + compute_metrics = None, + max_length = None, + truncation_mode = None, + optimize_device_cache = None, + **kwargs + ): + if args is None: args = UnslothIterativeSFTConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if 
force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('iterative_sft_trainer', other_metrics) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + eval_dataset = eval_dataset, + processing_class = processing_class, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + compute_metrics = compute_metrics, + max_length = max_length, + truncation_mode = truncation_mode, + optimize_device_cache = optimize_device_cache,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothKTOTrainer.py b/unsloth_compiled_cache/UnslothKTOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..5a3f7b71f6689813b1eb2ab403434a2caec91945 --- /dev/null +++ b/unsloth_compiled_cache/UnslothKTOTrainer.py @@ -0,0 +1,2034 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.kto_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, KTOConfig, KTOTrainer, Literal, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SequentialSampler, Trainer, TrainerCallback, TrainingArguments, Union, _get_kl_dataset, _process_tokens, _tokenize, autocast, concatenate_datasets, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, has_length, inspect, is_comet_available, is_liger_kernel_available, is_peft_available, is_wandb_available, itemgetter, 
log_table_to_comet_experiment, maybe_apply_chat_template, maybe_extract_prompt, maybe_unpair_preference_dataset, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_deepspeed, prepare_model_for_kbit_training, random, textwrap, torch, tqdm, wandb, warnings, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothKTOConfig(KTOConfig): + """ + + Configuration class for the [`KTOTrainer`]. + + This class includes only the parameters that are specific to KTO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. + loss_type (`str`, *optional*, defaults to `"kto"`): + Type of loss to use. Possible values are: + + - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper. + - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the + [APO](https://huggingface.co/papers/2408.06266) paper. + + desirable_weight (`float`, *optional*, defaults to `1.0`): + Desirable losses are weighed by this factor to counter unequal number of desirable and undesirable paris. + undesirable_weight (`float`, *optional*, defaults to `1.0`): + Undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs. 
+ label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from both the model and the reference model to W&B or Comet + during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. + dataset_num_proc: (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + use_liger_loss (`bool`, *optional*, defaults to `False`): + Whether to use Liger loss. It requires liger-kernel to be installed. + base_model_attribute_name (`str`, *optional*, defaults to `"model"`): + Name of the attribute in the model that contains the base model. This is used to get the base model from + the model when the model does not have a `get_decoder` method in the case when `use_liger_loss` is `True`. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + max_length = 1024, 
+ max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + loss_type = 'kto', + desirable_weight = 1.0, + undesirable_weight = 1.0, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + generate_during_eval = False, + is_encoder_decoder = None, + disable_dropout = True, + precompute_ref_log_probs = False, + model_init_kwargs = None, + ref_model_init_kwargs = None, + dataset_num_proc = None, + use_liger_loss = False, + base_model_attribute_name = 'model', + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = 
dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + loss_type = loss_type, + desirable_weight = desirable_weight, + undesirable_weight = undesirable_weight, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + disable_dropout = disable_dropout, + precompute_ref_log_probs = precompute_ref_log_probs, + model_init_kwargs = model_init_kwargs, + ref_model_init_kwargs = ref_model_init_kwargs, + dataset_num_proc = dataset_num_proc, + use_liger_loss = use_liger_loss, + base_model_attribute_name = base_model_attribute_name,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + 
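+# --- Illustrative usage sketch (not part of the generated cache) -------------------
+# The config class above is consumed by the `_UnslothKTOTrainer` defined below, whose
+# `kto_loss` implements Eq. (7) of the KTO paper (https://huggingface.co/papers/2402.01306):
+#   chosen  (desirable):   loss = 1 - sigmoid(beta * ((log pi - log pi_ref) - KL))
+#   rejected (undesirable): loss = 1 - sigmoid(beta * (KL - (log pi - log pi_ref)))
+# The commented sketch below shows how the two pieces would typically be wired together;
+# `model`, `tokenizer` and `train_dataset` are hypothetical placeholders, and the values
+# chosen here simply mirror the defaults declared in the config above.
+#
+# def _example_kto_setup(model, tokenizer, train_dataset):
+#     """Hedged sketch: build a KTO config/trainer pair using the classes in this module."""
+#     args = UnslothKTOConfig(
+#         output_dir = "kto_checkpoints",    # hypothetical path
+#         beta = 0.1,
+#         loss_type = "kto",
+#         per_device_train_batch_size = 4,   # must be > 1 so the in-batch KL term can be estimated
+#         max_length = 1024,
+#         max_prompt_length = 512,
+#     )
+#     trainer = _UnslothKTOTrainer(
+#         model = model,
+#         ref_model = None,                  # requires a PEFT model or precompute_ref_log_probs=True
+#         args = args,
+#         train_dataset = train_dataset,
+#         processing_class = tokenizer,
+#     )
+#     return trainer
+# -----------------------------------------------------------------------------------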
+class _UnslothKTOTrainer(Trainer): + r"""""" + + _tag_names = ["trl", "kto"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, str] = None, + ref_model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: KTOConfig = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + data_collator: Optional[DataCollator] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + model_adapter_name: Optional[str] = None, + ref_adapter_name: Optional[str] = None, + ): + if type(args) is TrainingArguments: + raise ValueError("Please use `KTOConfig` instead TrainingArguments.") + + if not isinstance(model, str) and ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must mass a copy of it, or `None` if you use peft." + ) + + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the KTOTrainer. But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + torch_dtype = model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." + ) + model_init_kwargs["torch_dtype"] = torch_dtype + + if args.ref_model_init_kwargs is None: + ref_model_init_kwargs = {} + elif not isinstance(ref_model, str): + raise ValueError( + "You passed ref_model_kwargs to the KTOTrainer. But your ref_model is already instantiated." + ) + else: + ref_model_init_kwargs = args.ref_model_init_kwargs + torch_dtype = ref_model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the KTOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." + ) + ref_model_init_kwargs["torch_dtype"] = torch_dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + if isinstance(ref_model, str): + ref_model = AutoModelForCausalLM.from_pretrained(ref_model, **ref_model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. 
+ self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it with `pip install peft` to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + self.is_peft_model = is_peft_available() and isinstance(model, PeftModel) + self.model_adapter_name = model_adapter_name + self.ref_adapter_name = ref_adapter_name + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model or args.precompute_ref_log_probs: + # The `model` with adapters turned off will be used as the reference model + self.ref_model = None + else: + self.ref_model = create_reference_model(model) + + if processing_class is None: + raise ValueError( + "max_length or a processing_class must be specified when using the default DPODataCollatorWithPadding" + ) + if args.max_length is None: + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `max_length` in the KTOTrainer's init" + " it will be set to `512` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_length = 512 + if args.max_length is not None: + max_length = args.max_length + + if args.max_prompt_length is None: + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `max_prompt_length` in the KTOTrainer's init" + " it will be set to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_prompt_length = 128 + if args.max_prompt_length is not None: + max_prompt_length = args.max_prompt_length + + max_completion_length = None + if args.max_completion_length is None and self.is_encoder_decoder: + warnings.warn( + "When using DPODataCollatorWithPadding with an encoder decoder architecture, you should set `max_completion_length` in the KTOTrainer's init" + " it will be set to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_completion_length = 128 + if args.max_completion_length is not None and self.is_encoder_decoder: + max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your KTOConfig" + " we have set it for you, but you should do it yourself in the future.", + UserWarning, + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + self.loss_type = args.loss_type + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.max_completion_length = max_completion_length + self.processing_class = processing_class + self.precompute_ref_log_probs = args.precompute_ref_log_probs + + # Not all losses require a KL calculation + self.calculate_KL = True + if self.loss_type in 
["apo_zero_unpaired"]: + self.calculate_KL = False + + # Since ref_logs are precomputed on the first call to get_train/eval_dataloader + # keep track of first called to avoid computation of future calls + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + + # metric + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # KTO parameter + self.beta = args.beta + self.desirable_weight = args.desirable_weight + self.undesirable_weight = args.undesirable_weight + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + warnings.warn( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + UserWarning, + ) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in KTO, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "prompt_input_ids" and "completion_input_ids". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. 
+ # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed + train_dataset = train_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from train dataset" + ) + # Unpair the dataset if needed + train_dataset = maybe_unpair_preference_dataset( + train_dataset, args.dataset_num_proc, desc="Unpairing train dataset" + ) + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + desc="Applying chat template to train dataset", + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_extract_prompt, num_proc=args.dataset_num_proc, desc="Extracting prompt from eval dataset" + ) + eval_dataset = maybe_unpair_preference_dataset( + eval_dataset, args.dataset_num_proc, desc="Unpairing eval dataset" + ) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + desc="Applying chat template to eval dataset", + ) + + # Tokenize and prepare the training datasets + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs={"tokenizer": self.processing_class}, + num_proc=args.dataset_num_proc, + desc="Tokenizing train dataset", + ) + + fn_kwargs = { + "prefix": "", + "is_encoder_decoder": self.is_encoder_decoder, + "tokenizer": self.processing_class, + "max_length": self.max_length, + "truncation_mode": self.truncation_mode, + "label_pad_token_id": self.label_pad_token_id, + "max_prompt_length": self.max_prompt_length, + "max_completion_length": self.max_completion_length, + } + + train_dataset = train_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized train dataset", + ) + + # Tokenize and prepare the eval datasets + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs={"tokenizer": self.processing_class}, + batched=True, + num_proc=args.dataset_num_proc, + desc="Tokenizing eval dataset", + ) + + eval_dataset = eval_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + desc="Processing tokenized eval dataset", + ) + + # Get KL datasets if needed + if self.calculate_KL: + if args.per_device_train_batch_size <= 1: + raise ValueError( + "Actual (not effective) batch size must be > 1. KTO will not work properly because the KL term will be equivalent to the implied reward." 
+ ) + + # create pairs for estimating the KL term by flipping the matched pairs in each batch of size total_batch_size + # i.e., [x_1, y_1], ..., [x_n, y_n] --> [x_1, y_n], ..., [x_n, y_1] = [x'_1, y'_1], ..., [x'_n, y'_n] + train_kl_dataset = train_dataset.map( + _get_kl_dataset, + batched=True, + batch_size=args.per_device_train_batch_size, + num_proc=args.dataset_num_proc, + desc="Extracting KL train dataset", + ) + + fn_kwargs["prefix"] = "KL_" + train_kl_dataset = train_kl_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=[c for c in train_kl_dataset.column_names if c in train_dataset.column_names], + desc="Processing tokenized train KL dataset", + ) + + # merge the datasets + train_dataset = concatenate_datasets([train_dataset, train_kl_dataset], axis=1) + + if eval_dataset is not None: + # Get KL dataset + eval_kl_dataset = eval_dataset.map( + _get_kl_dataset, + batched=True, + batch_size=args.per_device_train_batch_size, + num_proc=args.dataset_num_proc, + desc="Extracting eval KL dataset", + ) + + eval_kl_dataset = eval_kl_dataset.map( + _process_tokens, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=[c for c in eval_kl_dataset.column_names if c in eval_dataset.column_names], + desc="Processing tokenized eval KL dataset", + ) + + # merge the datasets + eval_dataset = concatenate_datasets([eval_dataset, eval_kl_dataset], axis=1) + + # calculate dataset desirability balance + num_desirable = max(sum(train_dataset["label"]), 1) + num_undesirable = max(len(train_dataset["label"]) - num_desirable, 1) # "label" is binary + + if num_desirable != num_undesirable: + # The lower and upper bounds come from Eq. [8] of https://huggingface.co/papers/2402.01306 + des_weight_lower_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1, 2) + des_weight_upper_bound = round((num_undesirable * self.undesirable_weight / num_desirable) * 1.33, 2) + und_weight_lower_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1.33, 2) + und_weight_upper_bound = round((num_desirable * self.desirable_weight / num_undesirable) / 1, 2) + + des_weight_in_range = des_weight_lower_bound <= self.desirable_weight <= des_weight_upper_bound + und_weight_in_range = und_weight_lower_bound <= self.undesirable_weight <= und_weight_upper_bound + + if not (des_weight_in_range or und_weight_in_range): + warnings.warn( + "You have different amounts of desirable/positive and undesirable/negative examples but the " + "weights on the desirable and undesirable losses don't seem to be in an ideal range. Based " + f"on your data, we recommend EITHER " + f"desirable_weight in [{des_weight_lower_bound}, {des_weight_upper_bound}] or " + f"undesirable_weight in [{und_weight_lower_bound}, {und_weight_upper_bound}] (but NOT BOTH). " + "See the documentation on how to optimally set these weights.", + UserWarning, + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. 
We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + # Deepspeed Zero-3 does not support precompute_ref_log_probs + if self.is_deepspeed_enabled: + if self.accelerator.state.deepspeed_plugin.zero_stage == 3 and self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with Deepspeed ZeRO-3. Please set `precompute_ref_log_probs=False`." + ) + + if self.ref_model is None: + if not (self.is_peft_model or self.precompute_ref_log_probs): + raise ValueError( + "No reference model and model is not a Peft model. Try setting `precompute_ref_log_probs=True`" + ) + else: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + # Import Liger loss if enabled + if self.args.use_liger_loss: + if not is_liger_kernel_available(): + raise ImportError( + "You set `use_liger_loss=True` but the liger kernel is not available. " + "Please install liger-kernel first: `pip install liger-kernel`" + ) + if self.loss_type in ["apo_zero_unpaired"]: + raise ValueError( + "You cannot set `loss_type='apo_zero_unpaired'` with liger-kernel." + "Only KTO loss is supported with liger-kernel." + ) + if self.precompute_ref_log_probs: + raise ValueError( + "You cannot use `precompute_ref_log_probs=True` with liger kernel. Please set " + "`precompute_ref_log_probs=False`." + ) + if self.is_peft_model or self.ref_adapter_name is not None: + raise ValueError( + "You cannot use `use_liger_loss=True` with Peft models. Please set `use_liger_loss=False`." + ) + self.kto_loss_fn = LigerFusedLinearKTOLoss( + ignore_index=self.label_pad_token_id, beta=self.beta, use_ref_model=(self.ref_model is not None) + ) + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + with ( + self.accelerator.unwrap_model(self.model).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.set_adapter(self.model_adapter_name or "default") + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_train_dataloader to precompute `ref_log_probs`. 
+ """ + + if self.precompute_ref_log_probs and not self._precomputed_train_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_train_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(self.train_dataset, **dataloader_params)) + reference_completion_logps = [] + reference_KL_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Train dataset reference log probs"): + reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + if self.calculate_KL: + reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp) + reference_KL_logps.append(reference_KL_logp.cpu()) + + self.train_dataset = self.train_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + + if self.calculate_KL: + self.train_dataset = self.train_dataset.add_column( + name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy() + ) + + self._precomputed_train_ref_log_probs = True + + return super().get_train_dataloader() + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass of transformers.src.transformers.trainer.get_eval_dataloader to precompute `ref_log_probs`. + + Args: + eval_dataset (`torch.utils.data.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted + by the `model.forward()` method are automatically removed. It must implement `__len__`. 
+ """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self.precompute_ref_log_probs and not self._precomputed_eval_ref_log_probs: + dataloader_params = { + "batch_size": self.args.per_device_eval_batch_size, + "collate_fn": self.data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "shuffle": False, + } + + # prepare dataloader + data_loader = self.accelerator.prepare(DataLoader(eval_dataset, **dataloader_params)) + + reference_completion_logps = [] + reference_KL_logps = [] + + for padded_batch in tqdm(iterable=data_loader, desc="Eval dataset reference log probs"): + reference_completion_logp, reference_KL_logp = self.compute_reference_log_probs(padded_batch) + + reference_completion_logp = self.accelerator.gather_for_metrics(reference_completion_logp) + reference_completion_logps.append(reference_completion_logp.cpu()) + + if self.calculate_KL: + reference_KL_logp = self.accelerator.gather_for_metrics(reference_KL_logp) + reference_KL_logps.append(reference_KL_logp.cpu()) + + eval_dataset = eval_dataset.add_column( + name="reference_logps", column=torch.cat(reference_completion_logps).float().numpy() + ) + if self.calculate_KL: + eval_dataset = eval_dataset.add_column( + name="reference_KL_logps", column=torch.cat(reference_KL_logps).float().numpy() + ) + + # Save calculated reference_chosen_logps and reference_rejected_logps to the eval_dataset for subsequent runs + if self.eval_dataset is not None: + self.eval_dataset = eval_dataset + self._precomputed_eval_ref_log_probs = True + + return super().get_eval_dataloader(eval_dataset=eval_dataset) + + def compute_reference_log_probs(self, padded_batch: dict) -> dict: + """Computes log probabilities of the reference model for a single padded batch of a KTO specific dataset.""" + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + if self.is_encoder_decoder: + completion_logits = self.model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + if self.calculate_KL: + KL_logits = self.model( + padded_batch["KL_prompt_input_ids"], + attention_mask=padded_batch["KL_prompt_attention_mask"], + decoder_input_ids=padded_batch.get("KL_completion_decoder_input_ids"), + labels=padded_batch["KL_completion_labels"], + ).logits + else: + completion_logits = self.model( + padded_batch["completion_input_ids"], + attention_mask=padded_batch["completion_attention_mask"], + ).logits + + if self.calculate_KL: + KL_logits = self.model( + padded_batch["KL_completion_input_ids"], + attention_mask=padded_batch["KL_completion_attention_mask"], + ).logits + else: + if self.is_encoder_decoder: + completion_logits = self.ref_model( + padded_batch["prompt_input_ids"], + attention_mask=padded_batch["prompt_attention_mask"], + decoder_input_ids=padded_batch.get("completion_decoder_input_ids"), + labels=padded_batch["completion_labels"], + ).logits + + if self.calculate_KL: + KL_logits = self.ref_model( + padded_batch["KL_prompt_input_ids"], + attention_mask=padded_batch["KL_prompt_attention_mask"], + decoder_input_ids=padded_batch.get("KL_completion_decoder_input_ids"), + labels=padded_batch["KL_completion_labels"], + ).logits + else: + 
completion_logits = self.ref_model( + padded_batch["completion_input_ids"], attention_mask=padded_batch["completion_attention_mask"] + ).logits + + if self.calculate_KL: + KL_logits = self.ref_model( + padded_batch["KL_completion_input_ids"], + attention_mask=padded_batch["KL_completion_attention_mask"], + ).logits + + completion_logps = self.get_batch_logps( + completion_logits, + padded_batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if self.calculate_KL: + KL_logps = self.get_batch_logps( + KL_logits, + padded_batch["KL_completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + else: + KL_logps = None + + return completion_logps, KL_logps + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: + Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. + """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + else: + # Fixes end-dec RuntimeError + labels = labels.clone() + + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels[labels == label_pad_token_id] = 0 + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + KL_logps = self._compute_kl_logps(model, batch) + + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + **model_kwargs, + ) + completion_logits = outputs.logits + + completion_logps = self.get_batch_logps( + completion_logits, + batch["completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + if completion_logps.shape[0] != len(batch["label"]): + raise ValueError( + "There is a mismatch between the number of examples in this batch and the number of " + "examples for which an output sequence was predicted." 
+ ) + + chosen_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(completion_logps.shape[0]) if batch["label"][i] is False] + + chosen_logps = completion_logps[chosen_idx, ...] + rejected_logps = completion_logps[rejected_idx, ...] + + chosen_logits = completion_logits[chosen_idx, ...] + rejected_logits = completion_logits[rejected_idx, ...] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps, outputs.aux_loss) + else: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, KL_logps) + + def kto_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + policy_KL_logps: torch.FloatTensor, + reference_chosen_logps: torch.FloatTensor, + reference_rejected_logps: torch.FloatTensor, + reference_KL_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the KTO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (num(chosen) in batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (num(rejected) in batch_size,) + policy_KL_logps: Log probabilities of the policy model for the KL responses. Shape: (batch_size,) + reference_chosen_logps: + Log probabilities of the reference model for the chosen responses. Shape: (num(chosen) in batch_size,) + reference_rejected_logps: + Log probabilities of the reference model for the rejected responses. Shape: (num(rejected) in + batch_size,) + reference_KL_logps: Log probabilities of the reference model for the KL responses. Shape: (batch_size,) + + Returns: + A tuple of four tensors: (losses, chosen_rewards, rejected_rewards, KL). The losses tensor contains the KTO + loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for + the chosen and rejected responses, respectively. The KL tensor contains the detached KL divergence estimate + between the policy and reference models. 
+ """ + if self.calculate_KL: + kl = (policy_KL_logps - reference_KL_logps).mean().detach() + kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0) + else: + kl = torch.zeros(1).to(policy_chosen_logps.device) + + # Chosen losses + if policy_chosen_logps.shape[0] != 0 or reference_chosen_logps.shape[0] != 0: + chosen_logratios = policy_chosen_logps - reference_chosen_logps + + if self.loss_type == "kto": + # Eqn (7) of the KTO paper (https://huggingface.co/papers/2402.01306) + chosen_losses = 1 - F.sigmoid(self.beta * (chosen_logratios - kl)) + elif self.loss_type == "apo_zero_unpaired": + # Unpaired variant of Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are better than your model's default output + chosen_losses = 1 - F.sigmoid(self.beta * chosen_logratios) + + chosen_rewards = self.beta * chosen_logratios.detach() + + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + chosen_losses = torch.Tensor([]).to(self.accelerator.device) + chosen_rewards = torch.Tensor([]).to(self.accelerator.device) + + # Rejected losses + if policy_rejected_logps.shape[0] != 0 or reference_rejected_logps.shape[0] != 0: + rejected_logratios = policy_rejected_logps - reference_rejected_logps + + if self.loss_type == "kto": + rejected_losses = 1 - F.sigmoid(self.beta * (kl - rejected_logratios)) + elif self.loss_type == "apo_zero_unpaired": + rejected_losses = F.sigmoid(self.beta * rejected_logratios) + + rejected_rewards = self.beta * rejected_logratios.detach() + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + rejected_losses = torch.Tensor([]).to(self.accelerator.device) + rejected_rewards = torch.Tensor([]).to(self.accelerator.device) + + losses = torch.cat( + (self.desirable_weight * chosen_losses, self.undesirable_weight * rejected_losses), + 0, + ) + + return losses, chosen_rewards, rejected_rewards, kl + + def _compute_kl_logps(self, model, batch): + """Compute KL log probabilities for a given batch.""" + KL_logps = None + if self.calculate_KL: + if self.is_encoder_decoder: + KL_model_kwargs = { + "input_ids": batch["KL_prompt_input_ids"], + "attention_mask": batch["KL_prompt_attention_mask"], + "labels": batch["KL_completion_labels"], + "decoder_input_ids": batch.get("KL_completion_decoder_input_ids"), + } + else: + KL_model_kwargs = { + "input_ids": batch["KL_completion_input_ids"], + "attention_mask": batch["KL_completion_attention_mask"], + } + + with torch.no_grad(): + KL_logits = model(**KL_model_kwargs).logits + + KL_logps = self.get_batch_logps( + KL_logits, + batch["KL_completion_labels"], + average_log_prob=False, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + return KL_logps + + def _compute_loss_liger(self, model, batch): + """ + Compute the KTO loss using the Liger-Kernel's LigerFusedLinearKTOLoss. + + Args: + model: + The policy model used for generating log probabilities and outputs. It could be an encoder-decoder + model or a regular language model. + batch: A dictionary containing the input data and labels for the batch. + + Returns: + A dictionary containing the following keys: + - "loss": The computed KTO loss for the batch. + - "chosen_logits_sum": Sum of the logits for the chosen responses from the policy model. + - "rejected_logits_sum": Sum of the logits for the rejected responses from the policy model. 
+ - "chosen_logps": Log probabilities of the chosen responses from the policy model. + - "rejected_logps": Log probabilities of the rejected responses from the policy model. + - "chosen_rewards": Rewards for the chosen responses. + - "rejected_rewards": Rewards for the rejected responses. + - "kl": The KL divergence between the policy and reference models (detached). + + If auxiliary loss is enabled, the dictionary will also include: + - "aux_loss": The auxiliary loss from the model outputs. + """ + policy_KL_logps = self._compute_kl_logps(model, batch) + reference_KL_logps = self._compute_kl_logps(self.ref_model, batch) + if self.calculate_KL: + kl = (policy_KL_logps - reference_KL_logps).mean().detach() + kl = self.accelerator.gather_for_metrics(kl).mean().clamp(min=0) + else: + kl = torch.zeros(1).to(self.accelerator.device) + + model_kwargs = ( + { + "labels": batch["completion_labels"], + "decoder_input_ids": batch.get("completion_decoder_input_ids"), + } + if self.is_encoder_decoder + else {} + ) + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + if self.is_encoder_decoder: + # 1. Get encoder outputs + encoder_outputs = model.get_encoder()( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + return_dict=True, + **model_kwargs, + ) + # 2. Get decoder outputs + outputs = model.get_decoder()( + input_ids=model_kwargs["decoder_input_ids"], + encoder_hidden_states=encoder_outputs.last_hidden_state, + use_cache=False, + **model_kwargs, + ) + # 1. Get reference encoder outputs + ref_encoder_outputs = self.ref_model.get_encoder()( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + return_dict=True, + **model_kwargs, + ) + # 2. Get reference decoder outputs + ref_outputs = self.ref_model.get_decoder()( + input_ids=model_kwargs["decoder_input_ids"], + encoder_hidden_states=ref_encoder_outputs.last_hidden_state, + use_cache=False, + **model_kwargs, + ) + else: + # skip the lm head and get the last hidden state + if hasattr(model, "get_decoder"): + base_model = model.get_decoder() + else: + base_model = getattr(model, self.args.base_model_attribute_name) + outputs = base_model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + use_cache=False, + **model_kwargs, + ) + + # reference model + if hasattr(self.ref_model, "get_decoder"): + ref_base_model = self.ref_model.get_decoder() + else: + ref_base_model = getattr(self.ref_model, self.args.base_model_attribute_name) + ref_outputs = ref_base_model( + batch["completion_input_ids"], + attention_mask=batch["completion_attention_mask"], + use_cache=False, + **model_kwargs, + ) + lm_head = model.get_output_embeddings() + ref_lm_head = self.ref_model.get_output_embeddings() + + ( + loss, + ( + chosen_logps_sum, + rejected_logps_sum, + chosen_logits_sum, + rejected_logits_sum, + chosen_rewards_sum, + rejected_rewards_sum, + ), + ) = self.kto_loss_fn( + _input=outputs.last_hidden_state[:, :-1] if not self.is_encoder_decoder else outputs.last_hidden_state, + lin_weight=lm_head.weight, + target=batch["completion_labels"][:, 1:], + bias=lm_head.bias if hasattr(lm_head, "bias") else None, + preference_labels=torch.tensor(batch["label"], dtype=torch.bool).to(self.accelerator.device), + ref_input=ref_outputs.last_hidden_state[:, :-1] + if not self.is_encoder_decoder + else outputs.last_hidden_state, + ref_weight=ref_lm_head.weight, + ref_bias=ref_lm_head.bias if hasattr(lm_head, "bias") else None, + kl=kl, + ) + + output = { + 
"loss": loss, + "chosen_logits_sum": chosen_logits_sum, + "rejected_logits_sum": rejected_logits_sum, + "chosen_logps_sum": chosen_logps_sum, + "rejected_logps_sum": rejected_logps_sum, + "chosen_rewards_sum": chosen_rewards_sum, + "rejected_rewards_sum": rejected_rewards_sum, + "kl": kl, + } + if self.aux_loss_enabled: + output["aux_loss"] = outputs.aux_loss + + return output + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + ): + """Compute the KTO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + batch = {k: (v.to(self.accelerator.device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()} + + labels = torch.tensor(batch["label"]) + num_chosen = labels.sum().to(self.accelerator.device) + num_rejected = (len(labels) - num_chosen).to(self.accelerator.device) + + if self.args.use_liger_loss: + model_output = self._compute_loss_liger(model, batch) + losses = model_output["loss"] + policy_chosen_logits = model_output["chosen_logits_sum"] + policy_rejected_logits = model_output["rejected_logits_sum"] + policy_chosen_logps = model_output["chosen_logps_sum"] + policy_rejected_logps = model_output["rejected_logps_sum"] + chosen_rewards = model_output["chosen_rewards_sum"] + rejected_rewards = model_output["rejected_rewards_sum"] + kl = model_output["kl"] + if self.aux_loss_enabled: + aux_loss = model_output["aux_loss"] + else: + forward_output = self.forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_KL_logps, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + # if reference_logps in batch use them, otherwise use the reference model + if "reference_logps" in batch: + chosen_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is True] + rejected_idx = [i for i in range(batch["reference_logps"].shape[0]) if batch["label"][i] is False] + + reference_chosen_logps = batch["reference_logps"][chosen_idx, ...] + reference_rejected_logps = batch["reference_logps"][rejected_idx, ...] 
+ if self.calculate_KL: + reference_KL_logps = batch["reference_KL_logps"] + else: + reference_KL_logps = None + else: + with torch.no_grad(): + if self.ref_model is None: + with self.null_ref_context(): + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + reference_KL_logps, + ) = self.forward(self.model, batch)[:5] + else: + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + reference_KL_logps, + ) = self.forward(self.ref_model, batch)[:5] + + losses, chosen_rewards, rejected_rewards, kl = self.kto_loss( + policy_chosen_logps, + policy_rejected_logps, + policy_KL_logps, + reference_chosen_logps, + reference_rejected_logps, + reference_KL_logps, + ) + + metrics["kl"] = kl.item() + + all_num_chosen = self.accelerator.gather_for_metrics(num_chosen).sum().item() + all_num_rejected = self.accelerator.gather_for_metrics(num_rejected).sum().item() + + if all_num_chosen > 0: + metrics["rewards/chosen_sum"] = ( + self.accelerator.gather_for_metrics(chosen_rewards.nansum()).nansum().item() + ) + metrics["logps/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logps.nansum()).nansum().item() + ) + metrics["logits/chosen_sum"] = ( + self.accelerator.gather_for_metrics(policy_chosen_logits.nansum()).nansum().item() + ) + metrics["count/chosen"] = all_num_chosen + + if all_num_rejected > 0: + metrics["rewards/rejected_sum"] = ( + self.accelerator.gather_for_metrics(rejected_rewards.nansum()).nansum().item() + ) + metrics["logps/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logps.nansum()).nansum().item() + ) + metrics["logits/rejected_sum"] = ( + self.accelerator.gather_for_metrics(policy_rejected_logits.nansum()).nansum().item() + ) + metrics["count/rejected"] = all_num_rejected + + loss = losses.nanmean() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Optional[torch.utils.data.Sampler]: + if dataset is None: + dataset = self.train_dataset + if dataset is None or not has_length(dataset): + return None + return SequentialSampler(dataset) + + def generate_from_model_and_ref(self, model, batch: dict[str, torch.LongTensor]) -> tuple[str, str]: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full 
precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + # if reference_output in batch use that otherwise use the reference model + if "reference_output" in batch: + reference_output = batch["reference_output"] + else: + if self.ref_model is None: + with self.null_ref_context(): + reference_output = self.model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + else: + reference_output = self.ref_model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + reference_output = pad_to_length(reference_output, self.max_length, self.processing_class.pad_token_id) + reference_output_decoded = self.processing_class.batch_decode(reference_output, skip_special_tokens=True) + + return policy_output_decoded, reference_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs) + + # force log the metrics + if self.accelerator.is_main_process: + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = {} + if "logits/chosen_sum" in metrics: + logits_dict["eval_logits/chosen"] = metrics["logits/chosen_sum"] + if "logits/rejected_sum" in metrics: + logits_dict["eval_logits/rejected"] = metrics["logits/rejected_sum"] + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. 
+ """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + target_indicies = [i for i in range(len(random_batch["label"])) if random_batch["label"][i] is False] + target_batch = { + "prompt_input_ids": random_batch["prompt_input_ids"][target_indicies], + "prompt_attention_mask": random_batch["prompt_attention_mask"][target_indicies], + "prompt": itemgetter(*target_indicies)(random_batch["prompt"]), + } + policy_output_decoded, ref_output_decoded = self.generate_from_model_and_ref(self.model, target_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy", "Ref Model"], + data=[ + [prompt, pol[len(prompt) :], ref[len(prompt) :]] + for prompt, pol, ref in zip(target_batch["prompt"], policy_output_decoded, ref_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # train metrics should have no prefix, eval should have 'eval_' + prefix = "eval_" if train_eval == "eval" else "" + # accumulate average metrics from sums and lengths + for split in ["chosen", "rejected"]: + if f"count/{split}" in self._stored_metrics[train_eval]: + count_sum = torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"]).sum().item() + for metric in ["rewards", "logps", "logits"]: + logs[f"{prefix}{metric}/{split}"] = ( + torch.Tensor(self._stored_metrics[train_eval][f"{metric}/{split}_sum"]).sum().item() + / count_sum + ) + # delete obsolete metric + del self._stored_metrics[train_eval][f"{metric}/{split}_sum"] + del self._stored_metrics[train_eval][f"count/{split}"] + # calculate reward margin + if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs: + logs[f"{prefix}rewards/margins"] = logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"] + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="KTO", + trainer_citation=citation, + paper_title="KTO: Model Alignment as Prospect Theoretic Optimization", + paper_id="2402.01306", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothKTOTrainer(_UnslothKTOTrainer): + """ + + Initialize KTOTrainer. 
+
+    Args:
+        model (`transformers.PreTrainedModel`):
+            The model to train, preferably an `AutoModelForCausalLM`.
+        ref_model (`PreTrainedModelWrapper`):
+            Hugging Face transformer model with a causal language modeling head. Used for implicit reward computation
+            and loss. If no reference model is provided, the trainer will create a reference model with the same
+            architecture as the model to be optimized.
+        args (`KTOConfig`):
+            The arguments to use for training.
+        train_dataset (`datasets.Dataset`):
+            The dataset to use for training.
+        eval_dataset (`datasets.Dataset`):
+            The dataset to use for evaluation.
+        processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*):
+            Processing class used to process the data. If provided, will be used to automatically process the inputs
+            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
+            reuse the fine-tuned model.
+        data_collator (`transformers.DataCollator`, *optional*, defaults to `None`):
+            The data collator to use for training. If None is specified, the default data collator
+            (`DPODataCollatorWithPadding`) will be used, which will pad the sequences to the maximum length of the
+            sequences in the batch, given a dataset of paired sequences.
+        model_init (`Callable[[], transformers.PreTrainedModel]`):
+            The model initializer to use for training. If None is specified, the default model initializer will be
+            used.
+        callbacks (`list[transformers.TrainerCallback]`):
+            The callbacks to use for training.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
+            The optimizer and scheduler to use for training.
+        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
+            The function to use to preprocess the logits before computing the metrics.
+        peft_config (`dict`, defaults to `None`):
+            The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in
+            a PEFT model.
+        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
+            The function to use to compute the metrics. Must take an `EvalPrediction` and return a dictionary mapping
+            metric names to values.
+        model_adapter_name (`str`, defaults to `None`):
+            Name of the train target PEFT adapter, when using LoRA with multiple adapters.
+        ref_adapter_name (`str`, defaults to `None`):
+            Name of the reference PEFT adapter, when using LoRA with multiple adapters.
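+
+    Example:
+        Hypothetical usage sketch; the model, tokenizer and training dataset below are placeholders, and the
+        dataset is assumed to follow TRL's unpaired KTO format with "prompt", "completion" and boolean "label"
+        columns:
+
+            trainer = UnslothKTOTrainer(
+                model=model,
+                ref_model=None,  # a reference copy is created internally when omitted
+                args=UnslothKTOConfig(output_dir="kto-out", per_device_train_batch_size=2),
+                train_dataset=train_dataset,
+                processing_class=tokenizer,
+            )
+            trainer.train()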
+ + """ + def __init__( + self, + model = None, + ref_model = None, + args = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + data_collator = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + model_adapter_name = None, + ref_adapter_name = None, + **kwargs + ): + if args is None: args = UnslothKTOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + 
args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('kto_trainer', other_metrics) + + super().__init__( + model = model, + ref_model = ref_model, + args = args, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + data_collator = data_collator, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothNashMDTrainer.py b/unsloth_compiled_cache/UnslothNashMDTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..79bff280fffe1ca11da8131e1ac2c609d0ecd7be --- /dev/null 
+++ b/unsloth_compiled_cache/UnslothNashMDTrainer.py @@ -0,0 +1,993 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.nash_md_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, GeometricMixtureWrapper, IterableDataset, NashMDConfig, NashMDTrainer, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, empty_cache, generate_model_card, get_comet_experiment_url, get_reward, is_conversational, is_peft_available, is_wandb_available, jinja2, maybe_apply_chat_template, nn, os, textwrap, torch, truncate_right, unwrap_model_for_generation, wandb) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothNashMDConfig(NashMDConfig): + """ + + Configuration class for the [`NashMDTrainer`]. + + Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following: + + Parameters: + mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`): + Logit mixture coefficient for the model and reference model. If a list of floats is provided then the + mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the + epochs. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + reward_model_path 
= None, + judge = None, + max_new_tokens = 64, + max_length = 512, + temperature = 0.9, + missing_eos_penalty = None, + loss_type = 'sigmoid', + dataset_num_proc = None, + disable_dropout = True, + use_vllm = False, + gpu_memory_utilization = 0.55, + ds3_gather_for_generation = True, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = 
dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + reward_model_path = reward_model_path, + judge = judge, + max_new_tokens = max_new_tokens, + max_length = max_length, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + loss_type = loss_type, + dataset_num_proc = dataset_num_proc, + disable_dropout = disable_dropout, + use_vllm = use_vllm, + gpu_memory_utilization = gpu_memory_utilization, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothNashMDTrainer(OnlineDPOTrainer): + r"""""" + + _tag_names = ["trl", "nash-md"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + ref_model: Union[PreTrainedModel, nn.Module] = None, + reward_model: Union[PreTrainedModel, nn.Module, 
None] = None, + judge: Optional[BasePairwiseJudge] = None, + args: Optional[NashMDConfig] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ) -> None: + super().__init__( + model=model, + ref_model=ref_model, + reward_model=reward_model, + judge=judge, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + reward_processing_class=processing_class, # for now, NashMDTrainer can't use any reward model + peft_config=peft_config, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + self._mixture_coef = self.args.mixture_coef + + # Overwrite the stats dictionary to include NashMD specific statistics + self.stats = { + # Remove "non_score_reward", "rlhf_reward", "scores_margin" + # Add "mixture_coef" + "loss/kl": [], + "objective/entropy": [], + "loss/score": [], + "rewards/probabilities": [], + "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], + "val/model_contain_eos_token": [], + "val/ref_contain_eos_token": [], + "beta": [], + "mixture_coef": [], + } + if self.reward_model is not None: + self.stats["rewards/chosen"] = [] + self.stats["rewards/rejected"] = [] + + @property + def mixture_coef(self): + if isinstance(self._mixture_coef, list): + epoch = self.state.epoch + return self._mixture_coef[epoch] if epoch < len(self._mixture_coef) else self._mixture_coef[-1] + else: + return self._mixture_coef + + def _generate_completions(self, model, prompts): + # Generate completions from the policy model. + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_for_gen_ctx: + model_output = unwrapped_policy_for_gen_ctx.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + # Get the DDP/FSDP unwrapped version of the main model. + # This will be the policy model for GeometricMixtureWrapper (PEFT adapters active if PEFT is used). + policy_model_for_gmw = self.accelerator.unwrap_model(model) + + # Determine the correct reference model for GeometricMixtureWrapper. + # This also needs to be DDP/FSDP unwrapped. + ref_model_for_gmw: torch.nn.Module + if self.ref_model is None: + # No explicit ref_model is provided. + # Use the base of the main `model` if it's a PEFT model. + # policy_model_for_gmw is already DDP-unwrapped. + if is_peft_available() and isinstance(policy_model_for_gmw, PeftModel): + ref_model_for_gmw = policy_model_for_gmw.get_base_model() + else: + # Not a PEFT model (or PEFT not available), or already a base model. + # Use the DDP-unwrapped policy model itself as the reference. + ref_model_for_gmw = policy_model_for_gmw + else: + # An explicit ref_model is provided. Unwrap it for DDP/FSDP. 
+ ref_model_for_gmw = self.accelerator.unwrap_model(self.ref_model) + + # Both models given to GeometricMixtureWrapper (policy_model_for_gmw and ref_model_for_gmw) are DDP-unwrapped. + with torch.no_grad(): # Ensure no_grad context for mixture model generation + mixture_model = GeometricMixtureWrapper( + model=policy_model_for_gmw, + ref_model=ref_model_for_gmw, + generation_config=self.generation_config, + mixture_coef=self.mixture_coef, + device=self.accelerator.device, + ) + + mixture_output = mixture_model.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + return model_output, mixture_output + + def _process_completions(self, model_output, mixture_output, prompts): + context_length = prompts["input_ids"].shape[1] + + # Process model completions + model_completion_ids = model_output[:, context_length:] + model_completion_ids, model_completion_mask = truncate_right( + model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + model_data = { + "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1), + "raw": prompts["raw"], + } + + # Process reference model completions + mixture_completion_ids = mixture_output[:, context_length:] + mixture_completion_ids, mixture_completion_mask = truncate_right( + mixture_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + mixture_data = { + "input_ids": torch.cat((prompts["input_ids"], mixture_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], mixture_completion_mask), dim=1), + "raw": prompts["raw"], + } + + return model_data, mixture_data + + def _compute_rewards(self, model_data, mixture_data, context_length): + with torch.no_grad(): + _, model_scores, _ = get_reward( + self.reward_model, model_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + _, mixture_scores, _ = get_reward( + self.reward_model, mixture_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + + # Apply EOS penalty if needed + if self.args.missing_eos_penalty is not None: + model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + mixture_contain_eos = torch.any(mixture_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + model_scores[~model_contain_eos] -= self.args.missing_eos_penalty + mixture_scores[~mixture_contain_eos] -= self.args.missing_eos_penalty + + return model_scores, mixture_scores + + def _compute_judge(self, model_data, mixture_data, context_length): + prompts = model_data["raw"] + model_data_completions = self.processing_class.batch_decode( + model_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + model_data_completions = [completion.strip() for completion in model_data_completions] + + mixture_data_completions = self.processing_class.batch_decode( + mixture_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + mixture_data_completions = [completion.strip() for completion in mixture_data_completions] + if is_conversational({"prompt": prompts[0]}): + model_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in model_data_completions + ] + environment = jinja2.Environment() + template = environment.from_string(SIMPLE_CHAT_TEMPLATE) + prompts = [template.render(messages=message) for message 
in prompts] + model_data_completions = [template.render(messages=completion) for completion in model_data_completions] + + mixture_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in mixture_data_completions + ] + mixture_data_completions = [ + template.render(messages=completion) for completion in mixture_data_completions + ] + + probability = self.judge.judge( + prompts, + list(zip(model_data_completions, mixture_data_completions)), + return_scores=True, + ) + return torch.tensor(probability, device=model_data["input_ids"].device) + + def _compute_logprobs(self, model, model_data, context_length): + def compute_logprobs_for_data(m, data): + output = m(data["input_ids"], attention_mask=data["attention_mask"]) + logits = output.logits[:, context_length - 1 : -1] + token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:]) + return token_logprobs + + # Compute logprobs for model completions under the model + model_logprobs_model_data = compute_logprobs_for_data(model, model_data) + + # Compute logprobs of model completions under the reference model + with torch.no_grad(): + if self.ref_model is None: + with model.disable_adapter(): + ref_logprobs_model_data = compute_logprobs_for_data(model, model_data) + else: + ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data) + + # Mask padding tokens + model_padding_mask = model_data["attention_mask"][:, context_length:] == 0 + model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + + return (model_logprobs_model_data, ref_logprobs_model_data) + + def _compute_losses( + self, + model_logprobs_model_data, + ref_logprobs_model_data, + probability, + ): + # reinforce score where 0.5 is a control variate + score = (probability - 0.5) * model_logprobs_model_data.sum(1) + + # kl divergence via reinforce + with torch.no_grad(): + log_ratio = model_logprobs_model_data - ref_logprobs_model_data + kl_div_log = log_ratio.sum(1) + kl_div_loss = (log_ratio * model_logprobs_model_data).sum(1) + + # final loss + loss = self.beta * kl_div_loss - score + + return loss.mean(), score, kl_div_log + + def _log_statistics( + self, + model_data, + mixture_data, + model_logprobs_model_data, + ref_logprobs_model_data, + probability, + score, + kl_div, + context_length, + model_scores=None, + mixture_scores=None, + ): + # Helper function to gather and compute mean + def gather_mean(tensor): + return self.accelerator.gather_for_metrics(tensor).mean().item() + + # Log score + self.stats["loss/score"].append(gather_mean(score)) + # Log KL divergence + self.stats["loss/kl"].append(gather_mean(kl_div)) + + # Log logprobs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + + self.stats["logps/chosen"].append(gather_mean(model_logprobs_model_data_sum)) + self.stats["logps/rejected"].append(gather_mean(ref_logprobs_model_data_sum)) + + # Log rewards + if self.reward_model is not None: + self.stats["rewards/chosen"].append(gather_mean(model_scores)) + self.stats["rewards/rejected"].append(gather_mean(mixture_scores)) + + # Log probabilities + self.stats["rewards/probabilities"].append(gather_mean(probability)) + + # Calculate entropy for model data + entropy_model_data = -model_logprobs_model_data.sum(1) + self.stats["objective/entropy"].append(gather_mean(entropy_model_data)) + + # Calculate 
margins + margin = model_logprobs_model_data_sum - ref_logprobs_model_data_sum + self.stats["rewards/margins"].append(gather_mean(margin)) + + # Calculate accuracy + accuracy = (margin > 0).float() + self.stats["rewards/accuracies"].append(gather_mean(accuracy)) + + # Log EOS token statistics + model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + mixture_eos = (mixture_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float())) + self.stats["val/ref_contain_eos_token"].append(gather_mean(mixture_eos.float())) + + # Log beta and mixture coef + self.stats["beta"].append(self.beta) + self.stats["mixture_coef"].append(self.mixture_coef) + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + # Apply chat template and tokenize the input + batch_size = len(next(iter(inputs.values()))) + prompts = inputs["prompt"] + inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)] + inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs] + inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs] + inputs = self.data_collator(inputs) + + # need the prompt_ only + inputs = self._prepare_inputs(inputs) + context_length = inputs["prompt_input_ids"].shape[1] + prompts = { + "input_ids": inputs["prompt_input_ids"], + "attention_mask": inputs["prompt_attention_mask"], + "raw": prompts, + } + del inputs + + # Sample completions from both the model and the reference model + model_output, mixture_output = self._generate_completions(model, prompts) + + # Process model completions + model_data, mixture_data = self._process_completions(model_output, mixture_output, prompts) + + # Compute rewards + if self.reward_model is not None: + model_scores, mixture_scores = self._compute_rewards(model_data, mixture_data, context_length) + # probability of the model data vs the mixture data + probability = F.sigmoid(model_scores - mixture_scores) + else: + model_scores, mixture_scores = None, None + probability = self._compute_judge(model_data, mixture_data, context_length) + + # Compute logprobs + model_logprobs_model_data, ref_logprobs_model_data = self._compute_logprobs(model, model_data, context_length) + + # Compute loss + loss, score, kl_div = self._compute_losses(model_logprobs_model_data, ref_logprobs_model_data, probability) + + # Log everything + self._log_statistics( + model_data, + mixture_data, + model_logprobs_model_data.detach(), + ref_logprobs_model_data, + probability, + score.detach(), + kl_div.detach(), + context_length, + model_scores, + mixture_scores, + ) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + # For LOMO optimizers you need to explicitly use the learning rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss, **kwargs) + + return loss.detach() / self.args.gradient_accumulation_steps + + def 
create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @inproceedings{munos2024nash, + title = {{Nash Learning from Human Feedback}}, + author = {R{\'{e}}mi Munos and Michal Valko and Daniele Calandriello and Mohammad Gheshlaghi Azar and Mark Rowland and Zhaohan Daniel Guo and Yunhao Tang and Matthieu Geist and Thomas Mesnard and C{\\^{o}}me Fiegel and Andrea Michi and Marco Selvi and Sertan Girgin and Nikola Momchev and Olivier Bachem and Daniel J. Mankowitz and Doina Precup and Bilal Piot}, + year = 2024, + booktitle = {Forty-first International Conference on Machine Learning, {ICML} 2024, Vienna, Austria, July 21-27, 2024}, + publisher = {OpenReview.net}, + url = {https://openreview.net/forum?id=Y5AmNYiyCQ} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Nash-MD", + trainer_citation=citation, + paper_title="Nash Learning from Human Feedback", + paper_id="2312.00886", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothNashMDTrainer(_UnslothNashMDTrainer): + """ + + Initialize NashMDTrainer as a subclass of [`OnlineDPOConfig`]. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForCausalLM`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + reward_model (`transformers.PreTrainedModel`): + The reward model to score completions with, preferably an `AutoModelForSequenceClassification`. + judge (`BasePairwiseJudge`): + The judge to use for pairwise comparison of model completions. + args (`NashMDConfig`): + The NashMD config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. 
+ eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + peft_config (`dict`): + The peft config to use for training. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + + """ + def __init__( + self, + model = None, + ref_model = None, + reward_model = None, + judge = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + peft_config = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + **kwargs + ): + if args is None: args = UnslothNashMDConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('nash_md_trainer', other_metrics) + + super().__init__( + model = model, + ref_model = ref_model, + reward_model = reward_model, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothORPOTrainer.py b/unsloth_compiled_cache/UnslothORPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f35e4c02a022bc06b6c19a0540d86b87f3a18ef3 --- /dev/null +++ b/unsloth_compiled_cache/UnslothORPOTrainer.py @@ -0,0 +1,1547 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.orpo_trainer import (Any, AutoModelForCausalLM, BaseImageProcessor, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalLoopOutput, F, FeatureExtractionMixin, Literal, ORPOConfig, ORPOTrainer, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, add_bos_token_if_needed, add_eos_token_if_needed, autocast, defaultdict, disable_dropout_in_model, generate_model_card, get_comet_experiment_url, inspect, is_comet_available, is_peft_available, is_torch_fx_proxy, is_torch_xla_available, is_wandb_available, log_table_to_comet_experiment, maybe_apply_chat_template, maybe_extract_prompt, nn, np, nullcontext, os, pad_to_length, pd, peft_module_casting_to_bf16, prepare_model_for_kbit_training, random, textwrap, torch, wandb, warnings, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import 
DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothORPOConfig(ORPOConfig): + """ + + Configuration class for the [`ORPOTrainer`]. + + This class includes only the parameters that are specific to ORPO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the relative ratio loss weight in the ORPO loss. In the + [paper](https://huggingface.co/papers/2403.07691), it is denoted by λ. In the + [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`int` or `None`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B or Comet during evaluation. + is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. 
+ model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + 
mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + max_length = 1024, + max_prompt_length = 512, + max_completion_length = None, + beta = 0.1, + disable_dropout = True, + label_pad_token_id = -100, + padding_value = None, + truncation_mode = 'keep_end', + generate_during_eval = False, + is_encoder_decoder = None, + model_init_kwargs = None, + dataset_num_proc = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + 
fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + beta = beta, + disable_dropout = disable_dropout, + label_pad_token_id = label_pad_token_id, + padding_value = padding_value, + truncation_mode = truncation_mode, + generate_during_eval = generate_during_eval, + is_encoder_decoder = is_encoder_decoder, + model_init_kwargs = model_init_kwargs, + dataset_num_proc = dataset_num_proc,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + 
self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothORPOTrainer(Trainer): + r"""""" + + _tag_names = ["trl", "orpo"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module, str]] = None, + args: Optional[ORPOConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, + ): + if args.model_init_kwargs is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_kwargs to the ORPOTrainer. But your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + torch_dtype = model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the ORPOConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." + ) + model_init_kwargs["torch_dtype"] = torch_dtype + + if isinstance(model, str): + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + # Initialize this variable to False. This helps tracking the case when `peft_module_casting_to_bf16` + # has been called in order to properly call autocast if needed. 
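+        # Descriptive note (added): this flag is read later in this class, e.g. in `compute_loss`,
+        # `generate_from_model` and `prediction_step`, to decide whether the forward/generate call
+        # should be wrapped in an `autocast` context once the PEFT adapters have been cast to bf16.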
+ self._peft_has_been_casted_to_bf16 = False + + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_config, we merge and unload it first + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # get peft model with the given config + model = model + if args.bf16 and getattr(model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(model) + # If args.bf16 we need to explicitly call `generate` with torch amp autocast context manager + self._peft_has_been_casted_to_bf16 = True + + # For models that use gradient_checkpointing, we need to attach a hook that enables input + # to explicitly have `requires_grad=True`, otherwise training will either silently + # fail or completely fail. + elif args.gradient_checkpointing: + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if args.generate_during_eval and not (is_wandb_available() or is_comet_available()): + raise ValueError( + "`generate_during_eval=True` requires Weights and Biases or Comet to be installed." + " Please install `wandb` or `comet-ml` to resolve." 
+ ) + + if model is not None: + self.is_encoder_decoder = model.config.is_encoder_decoder + elif args.is_encoder_decoder is None: + raise ValueError("When no model is provided, you need to pass the parameter is_encoder_decoder.") + else: + self.is_encoder_decoder = args.is_encoder_decoder + + if self.is_encoder_decoder: + self.decoder_start_token_id = model.config.decoder_start_token_id + self.pad_token_id = model.config.pad_token_id + + if processing_class is None: + raise ValueError("processing_class must be specified to tokenize a ORPO dataset.") + if args.max_length is None: + warnings.warn( + "`max_length` is not set in the ORPOConfig's init" + " it will default to `512` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_length = 512 + else: + max_length = args.max_length + if args.max_prompt_length is None: + warnings.warn( + "`max_prompt_length` is not set in the ORPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + max_prompt_length = 128 + else: + max_prompt_length = args.max_prompt_length + + if args.max_completion_length is None and self.is_encoder_decoder: + warnings.warn( + "When using an encoder decoder architecture, you should set `max_completion_length` in the ORPOConfig's init" + " it will default to `128` by default, but you should do it yourself in the future.", + UserWarning, + ) + self.max_completion_length = 128 + else: + self.max_completion_length = args.max_completion_length + + if data_collator is None: + data_collator = DPODataCollatorWithPadding( + pad_token_id=processing_class.pad_token_id, + label_pad_token_id=args.label_pad_token_id, + is_encoder_decoder=self.is_encoder_decoder, + ) + + if args.remove_unused_columns: + args.remove_unused_columns = False + # warn users + warnings.warn( + "When using DPODataCollatorWithPadding, you should set `remove_unused_columns=False` in your TrainingArguments" + " we have set it for you, but you should do it yourself in the future.", + UserWarning, + ) + + self.use_dpo_data_collator = True + else: + self.use_dpo_data_collator = False + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + + self.max_length = max_length + self.generate_during_eval = args.generate_during_eval + self.label_pad_token_id = args.label_pad_token_id + self.padding_value = args.padding_value if args.padding_value is not None else processing_class.pad_token_id + self.max_prompt_length = max_prompt_length + self.truncation_mode = args.truncation_mode + self.processing_class = processing_class + + self.beta = args.beta + self.aux_loss_enabled = getattr(model.config, "output_router_logits", False) + self.aux_loss_coef = getattr(model.config, "router_aux_loss_coef", 0.0) + if self.aux_loss_enabled and self.aux_loss_coef == 0.0: + warnings.warn( + "You set `output_router_logits` to `True` in the model config, but `router_aux_loss_coef` is set to " + "`0.0`, meaning the auxiliary loss will not be used. Either set `router_aux_loss_coef` to a value " + "greater than `0.0`, or set `output_router_logits` to `False` if you don't want to use the auxiliary " + "loss.", + UserWarning, + ) + + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in ORPO, the sampled data does not include the + # "input_ids" key. 
Instead, the available keys are "prompt_input_ids", "chosen_input_ids", and + # "rejected_input_ids". As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. + model.warnings_issued["estimate_tokens"] = True + + # Compute that only on the main process for faster data processing. + # see: https://github.com/huggingface/trl/pull/1255 + with PartialState().main_process_first(): + # Extract the prompt if needed, and apply the chat template if needed + train_dataset = train_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}, num_proc=args.dataset_num_proc + ) + train_dataset = train_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + if eval_dataset is not None: + eval_dataset = eval_dataset.map(maybe_extract_prompt, num_proc=args.dataset_num_proc) + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, + fn_kwargs={"tokenizer": processing_class}, + num_proc=args.dataset_num_proc, + ) + eval_dataset = eval_dataset.map(self.tokenize_row, num_proc=args.dataset_num_proc) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if not hasattr(self, "accelerator"): + raise AttributeError( + "Your `Trainer` does not have an `accelerator` object. Consider upgrading `transformers`." + ) + + def build_tokenized_answer(self, prompt, answer): + """ + Llama tokenizer does satisfy `enc(a + b) = enc(a) + enc(b)`. It does ensure `enc(a + b) = enc(a) + enc(a + + b)[len(enc(a)):]`. Reference: + https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257 + """ + + full_tokenized = self.processing_class(prompt + answer, add_special_tokens=False) + prompt_input_ids = self.processing_class(prompt, add_special_tokens=False)["input_ids"] + + answer_input_ids = full_tokenized["input_ids"][len(prompt_input_ids) :] + answer_attention_mask = full_tokenized["attention_mask"][len(prompt_input_ids) :] + + # Concat tokens to form `enc(a) + enc(a + b)[len(enc(a)):]` + full_concat_input_ids = np.concatenate([prompt_input_ids, answer_input_ids]) + + # Prepare input tokens for token by token comparison + full_input_ids = np.array(full_tokenized["input_ids"]) + + if len(full_input_ids) != len(full_concat_input_ids): + raise ValueError("Prompt input ids and answer input ids should have the same length.") + + # On some tokenizers, like Llama-2 tokenizer, there are occasions where tokens + # can be merged together when tokenizing prompt+answer. This could result + # on the last token from the prompt being different when tokenized on its own + # vs when done as prompt+answer. 
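+        # Illustrative, hypothetical example (added): if the prompt ends in "... the" and the answer
+        # begins with "re" (no leading space), a BPE-style tokenizer may merge the boundary tokens, so
+        # enc(prompt + answer)[:len(enc(prompt))] no longer equals enc(prompt). The check below then
+        # shifts the answer start index back by one token.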
+ response_token_ids_start_idx = len(prompt_input_ids) + + # If tokenized prompt is different than both prompt+answer, then it means the + # last token has changed due to merging. + if prompt_input_ids != full_tokenized["input_ids"][:response_token_ids_start_idx]: + response_token_ids_start_idx -= 1 + + prompt_input_ids = full_tokenized["input_ids"][:response_token_ids_start_idx] + prompt_attention_mask = full_tokenized["attention_mask"][:response_token_ids_start_idx] + + if len(prompt_input_ids) != len(prompt_attention_mask): + raise ValueError("Prompt input ids and attention mask should have the same length.") + + answer_input_ids = full_tokenized["input_ids"][response_token_ids_start_idx:] + answer_attention_mask = full_tokenized["attention_mask"][response_token_ids_start_idx:] + + return dict( + prompt_input_ids=prompt_input_ids, + prompt_attention_mask=prompt_attention_mask, + input_ids=answer_input_ids, + attention_mask=answer_attention_mask, + ) + + def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module]] = None) -> dict: + """Tokenize a single row from a ORPO specific dataset. + + At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation in case the prompt + + chosen or prompt + rejected responses is/are too long. First we truncate the prompt; if we're still too long, + we truncate the chosen/rejected. + + We also create the labels for the chosen/rejected responses, which are of length equal to the sum of the length + of the prompt and the chosen/rejected response, with label_pad_token_id for the prompt tokens. + """ + batch = {} + prompt = feature["prompt"] + chosen = feature["chosen"] + rejected = feature["rejected"] + + if not self.is_encoder_decoder: + # Check issues below for more details + # 1. https://github.com/huggingface/trl/issues/907 + # 2. https://github.com/EleutherAI/lm-evaluation-harness/pull/531#issuecomment-1595586257 + # 3. 
https://github.com/LianjiaTech/BELLE/issues/337 + + if not isinstance(prompt, str): + raise ValueError(f"prompt should be an str but got {type(prompt)}") + prompt_tokens = self.processing_class(prompt, add_special_tokens=False) + prompt_tokens = {f"prompt_{k}": v for k, v in prompt_tokens.items()} + + if not isinstance(chosen, str): + raise ValueError(f"chosen should be an str but got {type(chosen)}") + chosen_tokens = self.build_tokenized_answer(prompt, chosen) + + if not isinstance(rejected, str): + raise ValueError(f"rejected should be an str but got {type(rejected)}") + rejected_tokens = self.build_tokenized_answer(prompt, rejected) + + # Last prompt token might get merged by tokenizer and + # it should not be included for generation if that happens + prompt_len_input_ids = len(prompt_tokens["prompt_input_ids"]) + + chosen_prompt_len_input_ids = len(chosen_tokens["prompt_input_ids"]) + rejected_prompt_len_input_ids = len(rejected_tokens["prompt_input_ids"]) + prompt_len_input_ids = min(chosen_prompt_len_input_ids, rejected_prompt_len_input_ids) + + for k, v in prompt_tokens.items(): + prompt_tokens[k] = v[:prompt_len_input_ids] + + # Make sure prompts only have one different token at most an + # and length only differs by 1 at most + num_diff_tokens = sum( + [a != b for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"])] + ) + num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids) + if num_diff_tokens > 1 or num_diff_len > 1: + raise ValueError( + "Chosen and rejected prompt_input_ids might only differ on the " + "last token due to tokenizer merge ops." + ) + + # add BOS token to head of prompt. Avoid adding if it's already there + prompt_tokens, chosen_tokens, rejected_tokens = add_bos_token_if_needed( + self.processing_class.bos_token_id, + prompt_len_input_ids, + prompt_tokens, + chosen_prompt_len_input_ids, + chosen_tokens, + rejected_prompt_len_input_ids, + rejected_tokens, + ) + + # add EOS token to end of answer. 
Avoid adding if it's already there + chosen_tokens, rejected_tokens = add_eos_token_if_needed( + self.processing_class.eos_token_id, chosen_tokens, rejected_tokens + ) + + longer_response_length = max(len(chosen_tokens["input_ids"]), len(rejected_tokens["input_ids"])) + + # if combined sequence is too long, truncate the prompt + for answer_tokens in [chosen_tokens, rejected_tokens, prompt_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + if self.truncation_mode == "keep_start": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_prompt_length] + elif self.truncation_mode == "keep_end": + for k in ["prompt_input_ids", "prompt_attention_mask"]: + answer_tokens[k] = answer_tokens[k][-self.max_prompt_length :] + else: + raise ValueError(f"Unknown truncation mode: {self.truncation_mode}") + + # if that's still too long, truncate the response + for answer_tokens in [chosen_tokens, rejected_tokens]: + if len(answer_tokens["prompt_input_ids"]) + longer_response_length > self.max_length: + for k in ["input_ids", "attention_mask"]: + answer_tokens[k] = answer_tokens[k][: self.max_length - self.max_prompt_length] + + # Create labels + chosen_sequence_tokens = { + k: chosen_tokens[f"prompt_{k}"] + chosen_tokens[k] for k in ["input_ids", "attention_mask"] + } + rejected_sequence_tokens = { + k: rejected_tokens[f"prompt_{k}"] + rejected_tokens[k] for k in ["input_ids", "attention_mask"] + } + chosen_sequence_tokens["labels"] = chosen_sequence_tokens["input_ids"][:] + chosen_sequence_tokens["labels"][: len(chosen_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(chosen_tokens["prompt_input_ids"]) + rejected_sequence_tokens["labels"] = rejected_sequence_tokens["input_ids"][:] + rejected_sequence_tokens["labels"][: len(rejected_tokens["prompt_input_ids"])] = [ + self.label_pad_token_id + ] * len(rejected_tokens["prompt_input_ids"]) + + for k, toks in { + "chosen_": chosen_sequence_tokens, + "rejected_": rejected_sequence_tokens, + "": prompt_tokens, + }.items(): + for type_key, tokens in toks.items(): + if type_key == "token_type_ids": + continue + batch[f"{k}{type_key}"] = tokens + + else: + chosen_tokens = self.processing_class( + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + rejected_tokens = self.processing_class( + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True + ) + prompt_tokens = self.processing_class( + prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True + ) + + batch["chosen_labels"] = chosen_tokens["input_ids"] + batch["rejected_labels"] = rejected_tokens["input_ids"] + batch["prompt_input_ids"] = prompt_tokens["input_ids"] + batch["prompt_attention_mask"] = prompt_tokens["attention_mask"] + + if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): + batch["rejected_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["rejected_labels"]) + ) + batch["chosen_decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels( + labels=torch.tensor(batch["chosen_labels"]) + ) + + if is_torch_xla_available(): + # Pad the sequences to global max_length to avoid TorchXLA recompilation + for k in batch: + if "labels" in k or self.is_encoder_decoder: + pad_value = self.label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = self.padding_value + elif k.endswith("_attention_mask"): + 
pad_value = 0 + batch[k] = batch[k] + [pad_value] * (self.max_length - len(batch[k])) + return batch + + @staticmethod + def concatenated_inputs( + batch: dict[str, Union[list, torch.LongTensor]], + is_encoder_decoder: bool = False, + label_pad_token_id: int = -100, + padding_value: int = 0, + device: Optional[torch.device] = None, + ) -> dict[str, torch.LongTensor]: + """Concatenate the chosen and rejected inputs into a single tensor. + + Args: + batch: + A batch of data. Must contain the keys 'chosen_input_ids' and 'rejected_input_ids', which are tensors + of shape (batch_size, sequence_length). + is_encoder_decoder: + Whether the model is an encoder-decoder model. + label_pad_token_id: + The label pad token id. + padding_value: + The padding value to use for the concatenated inputs_ids. + device: + The device for the concatenated inputs. + + Returns: + A dictionary containing the concatenated inputs under the key 'concatenated_input_ids'. + """ + concatenated_batch = {} + + if is_encoder_decoder: + max_length = max(batch["chosen_labels"].shape[1], batch["rejected_labels"].shape[1]) + else: + max_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1]) + + for k in batch: + if k.startswith("chosen") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("chosen", "concatenated") + concatenated_batch[concatenated_key] = pad_to_length(batch[k], max_length, pad_value=pad_value) + for k in batch: + if k.startswith("rejected") and isinstance(batch[k], torch.Tensor): + if "labels" in k or is_encoder_decoder: + pad_value = label_pad_token_id + elif k.endswith("_input_ids"): + pad_value = padding_value + elif k.endswith("_attention_mask"): + pad_value = 0 + concatenated_key = k.replace("rejected", "concatenated") + concatenated_batch[concatenated_key] = torch.cat( + ( + concatenated_batch[concatenated_key], + pad_to_length(batch[k], max_length, pad_value=pad_value), + ), + dim=0, + ).to(device=device) + + if is_encoder_decoder: + concatenated_batch["concatenated_input_ids"] = batch["prompt_input_ids"].repeat(2, 1).to(device=device) + concatenated_batch["concatenated_attention_mask"] = ( + batch["prompt_attention_mask"].repeat(2, 1).to(device=device) + ) + + return concatenated_batch + + def odds_ratio_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute ORPO's odds ratio (OR) loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the ORPO + loss for each example in the batch. The chosen_rewards and rejected_rewards tensors contain the rewards for + the chosen and rejected responses, respectively. The log odds ratio of the chosen responses over the + rejected responses ratio for logging purposes. The `log(sigmoid(log_odds_chosen))` for logging purposes. + """ + + # Derived from Eqs. 
(4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x)) = P(y|x) + log_odds = (policy_chosen_logps - policy_rejected_logps) - ( + torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps)) + ) + ratio = F.logsigmoid(log_odds) + losses = self.beta * ratio + + chosen_rewards = self.beta * (policy_chosen_logps.to(self.accelerator.device)).detach() + rejected_rewards = self.beta * (policy_rejected_logps.to(self.accelerator.device)).detach() + + return losses, chosen_rewards, rejected_rewards, torch.mean(ratio), torch.mean(log_odds) + + @staticmethod + def get_batch_logps( + logits: torch.FloatTensor, + labels: torch.LongTensor, + average_log_prob: bool = False, + label_pad_token_id: int = -100, + is_encoder_decoder: bool = False, + ) -> torch.FloatTensor: + """Compute the log probabilities of the given labels under the given logits. + + Args: + logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size) + labels: + Labels for which to compute the log probabilities. Label tokens with a value of label_pad_token_id are + ignored. Shape: (batch_size, sequence_length) + average_log_prob: + If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the + log probabilities of the (non-masked) tokens. + label_pad_token_id: The label pad token id. + is_encoder_decoder: Whether the model is an encoder-decoder model. + + Returns: + A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the + given logits. + """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + + if not is_encoder_decoder: + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + loss_mask = labels != label_pad_token_id + + # dummy token; we'll ignore the losses on these tokens later + labels = torch.where(labels == label_pad_token_id, 0, labels) + + per_token_logps = selective_log_softmax(logits, labels) + + if average_log_prob: + return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + return (per_token_logps * loss_mask).sum(-1) + + def concatenated_forward( + self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]] + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + + We do this to avoid doing two forward passes, because it's faster for FSDP. 
+ """ + concatenated_batch = self.concatenated_inputs( + batch, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + padding_value=self.padding_value, + device=self.accelerator.device, + ) + len_chosen = batch["chosen_labels"].shape[0] + + model_kwargs = ( + { + "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]), + } + if self.is_encoder_decoder + else {} + ) + + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + outputs = model( + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + all_logits = outputs.logits + + def cross_entropy_loss(logits, labels): + if not self.is_encoder_decoder: + # Shift so that tokens < n predict n + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + logits = logits.view(-1, logits.shape[-1]) + labels = labels.view(-1) + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits, labels) + return loss + + if self.is_encoder_decoder: + labels = concatenated_batch["concatenated_labels"].clone() + else: + labels = concatenated_batch["concatenated_input_ids"].clone() + attention_mask = concatenated_batch["concatenated_attention_mask"] + labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id) + # orpo chosen nll loss is computed over the full prompt and response + chosen_nll_loss = cross_entropy_loss(all_logits[:len_chosen], labels[:len_chosen]) + + all_logps = self.get_batch_logps( + all_logits, + concatenated_batch["concatenated_labels"], + average_log_prob=True, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + ) + + chosen_logps = all_logps[:len_chosen] + rejected_logps = all_logps[len_chosen:] + + if not self.is_encoder_decoder: + chosen_logits = all_logits[:len_chosen, :-1, :] + rejected_logits = all_logits[len_chosen:, :-1, :] + else: + chosen_logits = all_logits[:len_chosen] + rejected_logits = all_logits[len_chosen:] + + if self.aux_loss_enabled: + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss, outputs.aux_loss) + + return (chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_nll_loss) + + def get_batch_loss_metrics( + self, + model, + batch: dict[str, Union[list, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the ORPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + + forward_output = self.concatenated_forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_nll_loss, + ) = forward_output[:5] + if self.aux_loss_enabled: + aux_loss = forward_output[5] + + losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = self.odds_ratio_loss( + policy_chosen_logps, policy_rejected_logps + ) + # full ORPO loss + loss = policy_nll_loss - losses.mean() + + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(chosen_rewards).mean() + metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(rejected_rewards).mean() + metrics[f"{prefix}rewards/accuracies"] = 
self.accelerator.gather_for_metrics(reward_accuracies).mean() + metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics( + chosen_rewards - rejected_rewards + ).mean() + metrics[f"{prefix}logps/rejected"] = self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean() + metrics[f"{prefix}logps/chosen"] = self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean() + metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics( + policy_rejected_logits.detach().mean() + ).mean() + metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics( + policy_chosen_logits.detach().mean() + ).mean() + metrics[f"{prefix}nll_loss"] = self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean() + metrics[f"{prefix}log_odds_ratio"] = self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean() + metrics[f"{prefix}log_odds_chosen"] = self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean() + if is_torch_xla_available(): + xm.mark_step() # needed because .item() calls + for k, v in metrics.items(): + metrics[k] = v.item() + if self.aux_loss_enabled: + loss += self.aux_loss_coef * aux_loss + + return loss, metrics + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + compute_loss_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with compute_loss_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train") + + # Make sure to move the loss to the device the original accumulating loss is at back in the `Trainer` class: + loss = loss.to(self.args.device) + + # force log the metrics + self.store_metrics(metrics, train_eval="train") + + if return_outputs: + return (loss, metrics) + return loss + + def generate_from_model(self, model, batch: dict[str, torch.LongTensor]) -> str: + """Generate samples from the model and reference model for the given batch of inputs.""" + + # If one uses `generate_during_eval` with peft + bf16, we need to explicitly call generate with + # the torch amp context manager as some hidden states are silently casted to full precision. + generate_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with generate_context_manager: + policy_output = model.generate( + input_ids=batch["prompt_input_ids"], + attention_mask=batch["prompt_attention_mask"], + max_length=self.max_length, + do_sample=True, + pad_token_id=self.processing_class.pad_token_id, + ) + + policy_output = pad_to_length(policy_output, self.max_length, self.processing_class.pad_token_id) + policy_output_decoded = self.processing_class.batch_decode(policy_output, skip_special_tokens=True) + + return policy_output_decoded + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ): + if not self.use_dpo_data_collator: + warnings.warn( + "prediction_step is only implemented for DPODataCollatorWithPadding, and you passed a datacollator that is different than " + "DPODataCollatorWithPadding - you might see unexpected behavior. 
Alternatively, you can implement your own prediction_step method if you are using a custom data collator" + ) + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + prediction_context_manager = ( + autocast(self.accelerator.device.type) if self._peft_has_been_casted_to_bf16 else nullcontext() + ) + + with torch.no_grad(), prediction_context_manager: + loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="eval") + + # force log the metrics + self.store_metrics(metrics, train_eval="eval") + + if prediction_loss_only: + return (loss.detach(), None, None) + + # logits for the chosen and rejected samples from model + logits_dict = { + "eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + logits = [v for k, v in logits_dict.items() if k not in ignore_keys] + logits = torch.tensor(logits, device=self.accelerator.device) + labels = torch.zeros(logits.shape[0], device=self.accelerator.device) + + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train") -> None: + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Overriding built-in evaluation loop to store metrics for each batch. Prediction/evaluation loop, shared by + `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + + # Sample and save to game log if requested (for one batch to save time) + if self.generate_during_eval: + # Generate random indices within the range of the total number of samples + num_samples = len(dataloader.dataset) + random_indices = random.sample(range(num_samples), k=self.args.eval_batch_size) + + # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader + random_batch_dataset = dataloader.dataset.select(random_indices) + random_batch = self.data_collator(random_batch_dataset) + random_batch = self._prepare_inputs(random_batch) + + policy_output_decoded = self.generate_from_model(self.model, random_batch) + + table = pd.DataFrame( + columns=["Prompt", "Policy"], + data=[ + [prompt, pol[len(prompt) :]] for prompt, pol in zip(random_batch["prompt"], policy_output_decoded) + ], + ) + if "wandb" in self.args.report_to: + wandb.log({"game_log": wandb.Table(data=table)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="game_log.csv", + table=table, + ) + + # Base evaluation + initial_output = super().evaluation_loop( + dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix + ) + + return initial_output + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`dict[str, float]`): + The values to log. + start_time (`float` or `None`, *optional*, defaults to `None`): + Start time of the training. 
+ """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = torch.tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + return super().log(logs, start_time) + + def _shift_right(self, input_ids): + if self.decoder_start_token_id is None: + raise ValueError( + "model.config.decoder_start_token_id has to be defined. It is usually set to the pad_token_id." + ) + + # shift inputs to the right + if is_torch_fx_proxy(input_ids): + # Item assignment is not supported natively for proxies. + shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), self.decoder_start_token_id) + shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1) + else: + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = self.decoder_start_token_id + + if self.pad_token_id is None: + raise ValueError("model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, self.pad_token_id) + + return shifted_input_ids + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{hong2024orpo, + title = {{ORPO: Monolithic Preference Optimization without Reference Model}}, + author = {Jiwoo Hong and Noah Lee and James Thorne}, + year = 2024, + eprint = {arXiv:2403.07691} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="ORPO", + trainer_citation=citation, + paper_title="ORPO: Monolithic Preference Optimization without Reference Model", + paper_id="2403.07691", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothORPOTrainer(_UnslothORPOTrainer): + """ + + Initialize ORPOTrainer. 
+ + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`ORPOConfig`): + The ORPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + compute_metrics = None, + **kwargs + ): + if args is None: args = UnslothORPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('orpo_trainer', other_metrics) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + compute_metrics = compute_metrics,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py b/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8fbf506dd19bf7448df3b45c1221dc65530c3ef --- /dev/null +++ b/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py @@ -0,0 +1,1260 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.online_dpo_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, DPODataCollatorWithPadding, DataCollator, DataLoader, Dataset, EvalPrediction, F, FeatureExtractionMixin, GenerationConfig, IterableDataset, OnlineDPOConfig, OnlineDPOTrainer, OptimizerNames, Optional, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, Trainer, TrainerCallback, Union, apply_chat_template, create_reference_model, datasets, disable_dropout_in_model, empty_cache, generate_model_card, get_comet_experiment_url, get_reward, is_conversational, is_peft_available, is_wandb_available, jinja2, logging, maybe_apply_chat_template, nn, os, prepare_deepspeed, seed_worker, textwrap, torch, truncate_right, unwrap_model_for_generation, version, wandb, warnings, wraps, F, is_conversational, os, torch, F, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, 
DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +def vLLMSamplingParams(**kwargs): + from vllm import SamplingParams + sampling_params = SamplingParams(**kwargs) + sampling_params._set_kwargs = kwargs + return sampling_params +@dataclass +class UnslothOnlineDPOConfig(OnlineDPOConfig): + """ + + Configuration class for the [`OnlineDPOTrainer`]. + + This class includes only the parameters that are specific to Online DPO training. For a full list of training + arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this + class may differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + reward_model_path (`str` or `None`, *optional*, defaults to `None`): + Path to the reward model. Either `judge` or `reward_model_path` must be set, but not both. + judge (`str` or `None`, *optional*, defaults to `None`): + Name of the judge to use. Either `judge` or `reward_model_path` must be set, but not both. + max_new_tokens (`int`, *optional*, defaults to `64`): + Maximum number of tokens to generate per completion. + max_length (`int`, *optional*, defaults to `256`): + Maximum total length of the sequence (prompt + completion) used to compute log probabilities. If the + sequence exceeds this limit, the leftmost tokens will be truncated to preserve as much of the completion as + possible. + temperature (`float`, *optional*, defaults to `0.9`): + Temperature for sampling. The higher the temperature, the more random the completions. + missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`): + Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage to + generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive + value. + beta (`float` or `list[float]`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is + selected for each new epoch and the last β is used for the rest of the epochs. + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. 
+ + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model and reference model. + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. Requires vLLM to be installed (`pip install vllm`). + gpu_memory_utilization (`float`, *optional*, defaults to `0.55`): + The vLLM memory utilization. The default value is 0.55. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + 
dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + reward_model_path = None, + judge = None, + max_new_tokens = 64, + max_length = 512, + temperature = 0.9, + missing_eos_penalty = None, + loss_type = 'sigmoid', + dataset_num_proc = None, + disable_dropout = True, + use_vllm = False, + gpu_memory_utilization = 0.55, + ds3_gather_for_generation = True, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + 
optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + reward_model_path = reward_model_path, + judge = judge, + max_new_tokens = max_new_tokens, + max_length = max_length, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + loss_type = loss_type, + dataset_num_proc = dataset_num_proc, + disable_dropout = disable_dropout, + use_vllm = use_vllm, + gpu_memory_utilization = gpu_memory_utilization, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothOnlineDPOTrainer(Trainer): + r"""""" + + _tag_names = ["trl", "online-dpo"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module], + ref_model: Union[PreTrainedModel, nn.Module, None] = None, + reward_model: Union[PreTrainedModel, nn.Module, None] = None, + judge: Optional[BasePairwiseJudge] = None, + args: Optional[OnlineDPOConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset], "datasets.Dataset"]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + reward_processing_class: Optional[PreTrainedTokenizerBase] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: 
tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ) -> None: + + if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'): + if (getattr(args, 'use_vllm', False) == False): + args.use_vllm = True + if ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, either omit the `ref_model` argument or pass `None`." + ) + + self.ref_model = ref_model + + if reward_model is not None and judge is not None: + warnings.warn( + "Both `reward_model` and `judge` are provided. Please choose provide only one of them. " + "Ignoring `judge` and using `reward_model`.", + UserWarning, + ) + judge = None + elif reward_model is None and judge is None: + raise ValueError("Either `reward_model` or `judge` must be provided.") + + self.reward_model = reward_model + self.reward_processing_class = reward_processing_class + self.judge = judge + self.is_encoder_decoder = model.config.is_encoder_decoder + + if args.missing_eos_penalty is not None and judge is not None: + raise ValueError("`missing_eos_penalty` is not supported when `judge` is provided.") + + if args is None: + raise ValueError("`args` must be provided.") + + # Check that the processing_class is provided + if processing_class is None: + raise ValueError("`processing_class` must be provided.") + + # Convert to PEFT model if peft_config is provided + if False: + # Check if PEFT is available + if not is_peft_available(): + raise ImportError( + "PEFT is not available and passed `peft_config`. Please install PEFT with " + "`pip install peft` to use it." + ) + + # If the model is already a PeftModel, we need to merge and unload it. + # Further information here: https://huggingface.co/docs/trl/dpo_trainer#reference-model-considerations-with-peft + if isinstance(model, PeftModel): + model = model.merge_and_unload() + + # Get peft model with the given config + model = model + + # Disable dropout in the model and reference model + if args.disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + + # Handle the ref_model + # Usually, the user wants the ref model to be the initial version of the model. When using PEFT, it's easy to + # get the ref model, as it's just the model with a disabled adapter. When not using PEFT, we need to create + # the ref model from the model by copying it and disable the gradients and set it in evaluation mode. + if ref_model is None: # No ref model provided, the most common case + if False: + self.ref_model = create_reference_model(model) # copy, disable gradients, set eval mode + else: + self.ref_model = None # we don't need a ref model here, we can just disable the adapter. 
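+                # Note: the PEFT wrapping branch above is compiled out in this Unsloth build, so when no
+                # ref_model is passed the reference log-probabilities are instead recovered later in
+                # training_step by temporarily disabling the PEFT adapter on the policy model.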
+ else: # rare case, the user provided a ref model + self.ref_model = ref_model + self.ref_model.eval() + + # Disable the gradient and set the reward model in eval mode + if self.reward_model is not None: + self.reward_model.eval() + + # Define the collator is not provided + if data_collator is None: + data_collator = DPODataCollatorWithPadding(pad_token_id=processing_class.pad_token_id) + + self.max_length = args.max_length + + self.stats = { + "objective/kl": [], + "objective/entropy": [], + "objective/non_score_reward": [], + "rewards/chosen": [], + "rewards/rejected": [], + "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], + "val/contain_eos_token": [], + "beta": [], + } + if self.reward_model is not None: + self.stats["objective/rlhf_reward"] = [] + self.stats["objective/scores_margin"] = [] + self.stats["objective/scores"] = [] + + if args.use_vllm: + self.llm = model.vllm_engine; self._last_loaded_step = 0; self.generation_config = SamplingParams( + n=2, + max_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=50, + top_p=1.0, + detokenize=False, + **getattr(getattr(args, 'vllm_sampling_params', vLLMSamplingParams()), '_set_kwargs', {}), + ) + else: + self.generation_config = GenerationConfig( + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=50, + top_p=1.0, + do_sample=True, + use_cache=False if args.gradient_checkpointing else True, + ) + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in Online DPO, the sampled data does not include + # the "input_ids" key. As a result, the trainer issues the warning: "Could not estimate the number of tokens + # of the input, floating-point operations will not be computed." To suppress this warning, we set the + # "estimate_tokens" key in the model's "warnings_issued" dictionary to True. This acts as a flag to indicate + # that the warning has already been issued. 
+ model.warnings_issued["estimate_tokens"] = True + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + self._beta = args.beta + + # Placed after the super[].__init__ because we need self.is_deepspeed_enabled and self.accelerator + if self.is_deepspeed_enabled: + if self.reward_model is not None: + self.reward_model = prepare_deepspeed( + self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + if self.ref_model is not None: + self.ref_model = prepare_deepspeed( + self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + else: + if self.ref_model is not None: + self.ref_model = self.ref_model.to(self.accelerator.device) + if self.reward_model is not None: + self.reward_model = self.reward_model.to(self.accelerator.device) + + @property + def beta(self): + if isinstance(self._beta, list): + epoch = self.state.epoch + return self._beta[epoch] if epoch < len(self._beta) else self._beta[-1] + else: + return self._beta + + @staticmethod + def tokenize_row(feature, is_encoder_decoder: bool, tokenizer: PreTrainedTokenizerBase) -> dict[str, Any]: + """Tokenize a single row from a DPO specific dataset.""" + if not is_encoder_decoder: + batch = tokenizer(feature["prompt"], add_special_tokens=False) + # Add BOS token to head of prompt. Avoid adding if it's already there + if tokenizer.bos_token_id is not None: + prompt_len_input_ids = len(batch["input_ids"]) + if prompt_len_input_ids == 0 or tokenizer.bos_token_id != batch["input_ids"][0]: + batch["input_ids"] = [tokenizer.bos_token_id] + batch["input_ids"] + batch["attention_mask"] = [1] + batch["attention_mask"] + else: + batch = tokenizer(feature["prompt"], add_special_tokens=True) + batch = {f"prompt_{key}": value for key, value in batch.items()} + return batch + + # Same as Trainer.get_train_dataloader but skip the "remove_unused_columns". + @wraps(Trainer.get_train_dataloader) + def get_train_dataloader(self) -> DataLoader: + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + dataloader_params = { + "batch_size": self._train_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + # Same as Trainer.get_eval_dataloader but skip the "remove_unused_columns". 
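+    # Keeping the unused columns is deliberate here: Online DPO needs the raw "prompt" strings in every
+    # batch so that training_step can re-apply the chat template and sample fresh completions, whereas the
+    # stock Trainer dataloader (with remove_unused_columns=True) would drop any column that is not an
+    # argument of the model's forward pass.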
+ @wraps(Trainer.get_eval_dataloader) + def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader: + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + + # If we have persistent workers, don't do a fork bomb especially as eval datasets + # don't change during training + dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval" + if ( + hasattr(self, "_eval_dataloaders") + and dataloader_key in self._eval_dataloaders + and self.args.dataloader_persistent_workers + ): + return self.accelerator.prepare(self._eval_dataloaders[dataloader_key]) + + eval_dataset = ( + self.eval_dataset[eval_dataset] + if isinstance(eval_dataset, str) + else eval_dataset + if eval_dataset is not None + else self.eval_dataset + ) + data_collator = self.data_collator + + dataloader_params = { + "batch_size": self.args.eval_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(eval_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + # accelerator.free_memory() will destroy the references, so + # we need to store the non-prepared version + eval_dataloader = DataLoader(eval_dataset, **dataloader_params) + if self.args.dataloader_persistent_workers: + if hasattr(self, "_eval_dataloaders"): + self._eval_dataloaders[dataloader_key] = eval_dataloader + else: + self._eval_dataloaders = {dataloader_key: eval_dataloader} + + return self.accelerator.prepare(eval_dataloader) + + def _generate_vllm(self, model, prompts): + eos_token_id = self.processing_class.eos_token_id + pad_token_id = self.processing_class.pad_token_id + + # Load the latest weights + + pass + + pass + + if is_conversational({"prompt": prompts[0]}): + outputs = self.llm.chat(prompts, self.generation_config, use_tqdm=False, lora_request = self.model.load_lora('online_dpo_trainer_lora_model', load_tensors = True)) + else: + outputs = self.llm.generate(prompts, self.generation_config, use_tqdm=False, lora_request = self.model.load_lora('online_dpo_trainer_lora_model', load_tensors = True)) + + completion_ids = [list(output.outputs[i].token_ids) for i in range(2) for output in outputs] + prompt_ids = [list(output.prompt_token_ids) for _ in range(2) for output in outputs] + + # Create mask and pad the prompt and completion + max_prompt_length = max(len(ids) for ids in prompt_ids) + prompt_mask = [[0] * (max_prompt_length - len(ids)) + [1] * len(ids) for ids in prompt_ids] + prompt_ids = [[pad_token_id] * (max_prompt_length - len(ids)) + ids for ids in prompt_ids] + max_tokens = self.generation_config.max_tokens + completion_mask = [[1] * len(ids) + [0] * (max_tokens - len(ids)) for ids in completion_ids] + completion_ids = [ + ids + [eos_token_id] if ids[-1] != eos_token_id and len(ids) < max_tokens else ids + for ids in completion_ids + ] + completion_ids = [ids + [pad_token_id] * (max_tokens - len(ids)) for ids in completion_ids] + + # Convert to tensors + prompt_ids = torch.tensor(prompt_ids, device=self.accelerator.device) + prompt_mask = torch.tensor(prompt_mask, device=self.accelerator.device) + completion_ids = torch.tensor(completion_ids, 
device=self.accelerator.device) + completion_mask = torch.tensor(completion_mask, device=self.accelerator.device) + + return prompt_ids, prompt_mask, completion_ids, completion_mask + + def _generate(self, model, prompts): + eos_token_id = self.processing_class.eos_token_id + pad_token_id = self.processing_class.pad_token_id + + # Apply chat template and tokenize the input. We do this on-the-fly to enable the use of reward models and + # policies with different tokenizers / chat templates. + inputs = [{"prompt": prompt} for prompt in prompts] + inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs] + inputs = [self.tokenize_row(x, self.is_encoder_decoder, self.processing_class) for x in inputs] + inputs = self.data_collator(inputs) + + # Sample 2 completions per prompt of size `max_new_tokens` from the model + inputs = self._prepare_inputs(inputs) + prompt_ids = inputs["prompt_input_ids"].repeat(2, 1) + prompt_mask = inputs["prompt_attention_mask"].repeat(2, 1) + with unwrap_model_for_generation( + model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + output = unwrapped_model.generate( + input_ids=prompt_ids, + attention_mask=prompt_mask, + generation_config=self.generation_config, + ) + + completion_ids = output[:, prompt_ids.size(1) :] + completion_ids, completion_mask = truncate_right(completion_ids, eos_token_id, pad_token_id) + + return prompt_ids, prompt_mask, completion_ids, completion_mask + + def _forward(self, model, prompt_ids, prompt_mask, completion_ids, completion_mask): + # Get the number of tokens to truncate from prompt + num_tokens_to_truncate = max(prompt_ids.size(1) + completion_ids.size(1) - self.max_length, 0) + + # Truncate left to avoid oom + prompt_ids = prompt_ids[:, num_tokens_to_truncate:] + prompt_mask = prompt_mask[:, num_tokens_to_truncate:] + + # Concat the prompt and completion + prompt_completion_ids = torch.cat((prompt_ids, completion_ids), dim=1) + prompt_completion_mask = torch.cat((prompt_mask, completion_mask), dim=1) + + # Get the logprobs of the completions from the model + output = model(prompt_completion_ids, attention_mask=prompt_completion_mask) + + # There is 1 offset, because the model predict the next token + logits = output.logits[:, prompt_ids.size(1) - 1 : -1] + + # Take the completion tokens logprob + logprobs = torch.take_along_dim(logits.log_softmax(dim=-1), completion_ids.unsqueeze(-1), dim=2).squeeze(-1) + return logprobs + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + prompts = inputs["prompt"] + batch_size = len(prompts) + + if self.args.use_vllm: + prompt_ids, prompt_mask, completion_ids, completion_mask = self._generate_vllm(model, prompts) + else: + prompt_ids, prompt_mask, completion_ids, completion_mask = self._generate(model, prompts) + + contain_eos_token = torch.any(completion_ids == self.processing_class.eos_token_id, dim=-1) + + logprobs = self._forward(model, prompt_ids, prompt_mask, completion_ids, completion_mask) + with torch.no_grad(): + if self.ref_model is not None: + ref_logprobs = self._forward(self.ref_model, prompt_ids, prompt_mask, completion_ids, completion_mask) + else: # peft case: we just need to disable the adapter + with self.model.disable_adapter(): + ref_logprobs = self._forward(self.model, prompt_ids, prompt_mask, completion_ids, completion_mask) + + # Decode the completions, and format them if 
the input is conversational + device = logprobs.device + completions = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) + if is_conversational({"prompt": prompts[0]}): + completions = [[{"role": "assistant", "content": completion}] for completion in completions] + + # Get the reward from the reward model or judge + if self.judge is not None: + # Once formatted, conversational data may contain special tokens (such as <|im_start|>) that are not + # directly understandable by the judge and could alter its judgment. To avoid this and make the judge + # independent of the model's chat template, we use the raw conversation data, and apply our own chat + # template to it. + if is_conversational({"prompt": prompts[0]}): + environment = jinja2.Environment() + template = environment.from_string(SIMPLE_CHAT_TEMPLATE) + prompts = [template.render(messages=prompt) for prompt in prompts] + completions = [template.render(messages=completion) for completion in completions] + + ranks_of_first_completion = self.judge.judge( + prompts, list(zip(completions[:batch_size], completions[batch_size:])) + ) + + # convert ranks to a True/False mask: + # when rank == 0, it means the first completion is the best + # when rank == 1, it means the second completion is the best + mask = torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=device) + else: + # The reward model may not have the same chat template or tokenizer as the model, so we need to use the + # raw data (string), apply the chat template (if needed), and tokenize it with the reward processing class. + prompts = 2 * prompts # repeat the prompt: [prompt0, prompt1] -> [prompt0, prompt1, prompt0, prompt1] + if is_conversational({"prompt": prompts[0]}): + examples = [{"prompt": p, "completion": c} for p, c in zip(prompts, completions)] + examples = [apply_chat_template(example, self.reward_processing_class) for example in examples] + prompts = [example["prompt"] for example in examples] + completions = [example["completion"] for example in examples] + + # Tokenize the prompts + prompts_ids = self.reward_processing_class( + prompts, padding=True, return_tensors="pt", padding_side="left" + )["input_ids"].to(device) + context_length = prompts_ids.shape[1] + + # Tokenize the completions + completions_ids = self.reward_processing_class( + completions, padding=True, return_tensors="pt", padding_side="right" + )["input_ids"].to(device) + + # Concatenate the prompts and completions and get the reward + prompt_completion_ids = torch.cat((prompts_ids, completions_ids), dim=1) + with torch.inference_mode(): + _, scores, _ = get_reward( + self.reward_model, prompt_completion_ids, self.reward_processing_class.pad_token_id, context_length + ) + + # Filter completion. Ensure that the sample contains stop_token_id + # Completions not passing that filter will receive a lower score. 
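+            # For example (hypothetical value): with missing_eos_penalty=1.0, a completion that never emits
+            # the EOS token has 1.0 subtracted from its reward-model score below, nudging the subsequent
+            # chosen/rejected split toward completions that terminate properly.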
+ if self.args.missing_eos_penalty is not None: + scores[~contain_eos_token] -= self.args.missing_eos_penalty + + # Split the scores in 2 (the prompts of the first half are the same as the second half) + first_half, second_half = scores.split(batch_size) + + # Get the indices of the chosen and rejected examples + mask = first_half >= second_half + + batch_range = torch.arange(batch_size, device=device) + chosen_indices = batch_range + (~mask * batch_size) + rejected_indices = batch_range + (mask * batch_size) + + # Build tensor so that the first half is the chosen examples and the second half the rejected examples + cr_indices = torch.cat((chosen_indices, rejected_indices), dim=0) # cr = chosen and rejected + cr_logprobs = logprobs[cr_indices] + cr_ref_logprobs = ref_logprobs[cr_indices] + + # mask out the padding tokens + padding_mask = ~completion_mask.bool() + cr_padding_mask = padding_mask[cr_indices] + + cr_logprobs_sum = (cr_logprobs * ~cr_padding_mask).sum(1) + cr_ref_logprobs_sum = (cr_ref_logprobs * ~cr_padding_mask).sum(1) + + # Split the chosen and rejected examples + chosen_logprobs_sum, rejected_logprobs_sum = torch.split(cr_logprobs_sum, batch_size) + chosen_ref_logprobs_sum, rejected_ref_logprobs_sum = torch.split(cr_ref_logprobs_sum, batch_size) + pi_logratios = chosen_logprobs_sum - rejected_logprobs_sum + ref_logratios = chosen_ref_logprobs_sum - rejected_ref_logprobs_sum + + logits = pi_logratios - ref_logratios + + if self.args.loss_type == "sigmoid": + losses = -F.logsigmoid(self.beta * logits) + elif self.args.loss_type == "ipo": + losses = (logits - 1 / (2 * self.beta)) ** 2 + else: + raise NotImplementedError(f"invalid loss type {self.loss_type}") + + loss = losses.mean() + + # Log everything + if self.reward_model is not None: + scores_margin = scores[chosen_indices] - scores[rejected_indices] + self.stats["objective/scores_margin"].append( + self.accelerator.gather_for_metrics(scores_margin.mean()).mean().item() + ) + self.stats["objective/scores"].append(self.accelerator.gather_for_metrics(scores.mean()).mean().item()) + self.stats["val/contain_eos_token"].append(contain_eos_token.float().mean().item()) + self.stats["logps/chosen"].append(self.accelerator.gather_for_metrics(chosen_logprobs_sum).mean().item()) + self.stats["logps/rejected"].append(self.accelerator.gather_for_metrics(rejected_logprobs_sum).mean().item()) + + kl = logprobs - ref_logprobs + mean_kl = kl.sum(1).mean() + self.stats["objective/kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + non_score_reward = (-self.beta * kl).sum(1) + mean_non_score_reward = non_score_reward.mean() + self.stats["objective/non_score_reward"].append( + self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item() + ) + if self.reward_model is not None: + rlhf_reward = scores + non_score_reward + self.stats["objective/rlhf_reward"].append(self.accelerator.gather_for_metrics(rlhf_reward).mean().item()) + mean_entropy = -logprobs.sum(1).mean() + self.stats["objective/entropy"].append(self.accelerator.gather_for_metrics(mean_entropy).mean().item()) + chosen_rewards = self.beta * (chosen_logprobs_sum - chosen_ref_logprobs_sum) + gathered_chosen_rewards = self.accelerator.gather_for_metrics(chosen_rewards) + self.stats["rewards/chosen"].append(gathered_chosen_rewards.mean().item()) + rejected_rewards = self.beta * (rejected_logprobs_sum - rejected_ref_logprobs_sum) + gathered_rejected_rewards = self.accelerator.gather_for_metrics(rejected_rewards) + 
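+        # These are the implicit DPO rewards beta * (log pi(y|x) - log pi_ref(y|x)), summed over the
+        # completion tokens; the margin and accuracy statistics logged next are derived from the gathered
+        # chosen/rejected reward values.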
self.stats["rewards/rejected"].append(gathered_rejected_rewards.mean().item()) + margin = gathered_chosen_rewards - gathered_rejected_rewards + self.stats["rewards/margins"].append(margin.mean().item()) + accuracy = margin > 0 + self.stats["rewards/accuracies"].append(accuracy.float().mean().item()) + self.stats["beta"].append(self.beta) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + + # For LOMO optimizers you need to explicitly use the learning rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss, **kwargs) + + return loss.detach() / self.args.gradient_accumulation_steps + + # Same as Trainer._maybe_log_save_evaluate but log our metrics + def _maybe_log_save_evaluate( + self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=None + ): + if self.control.should_log and self.state.global_step > self._globalstep_last_logged: + logs: dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + + # reset tr_loss to zero + tr_loss -= tr_loss + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + if grad_norm is not None: + logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm + if learning_rate is not None: + logs["learning_rate"] = learning_rate + else: + logs["learning_rate"] = self._get_learning_rate() + + # Add our metrics + for key, val in self.stats.items(): + logs[key] = sum(val) / len(val) + self.stats = {key: [] for key in self.stats} # reset stats + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step + self.store_flos() + self.log(logs, start_time) + + metrics = None + if self.control.should_evaluate: + metrics = self._evaluate(trial, ignore_keys_for_eval) + is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial) + + if self.args.save_strategy == "best": + self.control.should_save = is_new_best_metric + + if self.control.should_save: + self._save_checkpoint(model, trial) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{guo2024direct, + title = {{Direct Language Model Alignment from Online AI Feedback}}, + author = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{\'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel}, + year = 2024, + eprint = {arXiv:2402.04792} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Online DPO", + trainer_citation=citation, + paper_title="Direct Language Model Alignment from Online AI Feedback", + paper_id="2402.04792", + ) + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothOnlineDPOTrainer(_UnslothOnlineDPOTrainer): + """ + + Initialize OnlineDPOTrainer. + + Args: + model (`transformers.PreTrainedModel` or `torch.nn.Module`): + The model to train, preferably an `AutoModelForCausalLM`. + ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`): + The reference model to use for training. If None is specified, the reference model will be created from the + model. + reward_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`): + The reward model to score completions with, preferably an `AutoModelForSequenceClassification`. + judge (`BasePairwiseJudge`): + The judge to use for pairwise comparison of model completions. + args (`OnlineDPOConfig`): + The online DPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + peft_config (`dict`): + The peft config to use for training. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. 
+ preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + + """ + def __init__( + self, + model, + ref_model = None, + reward_model = None, + judge = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + reward_processing_class = None, + peft_config = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + **kwargs + ): + if args is None: args = UnslothOnlineDPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 
'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('online_dpo_trainer', other_metrics) + + super().__init__( + model = model, + ref_model = ref_model, + reward_model = reward_model, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + reward_processing_class = reward_processing_class, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothPPOTrainer.py 
b/unsloth_compiled_cache/UnslothPPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d2b86db4082fbdff10bb536640f16e2586648226 --- /dev/null +++ b/unsloth_compiled_cache/UnslothPPOTrainer.py @@ -0,0 +1,1300 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.ppo_trainer import (Accelerator, BaseImageProcessor, CallbackHandler, DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK, DataCollatorWithPadding, DataLoader, Dataset, ExportableState, FeatureExtractionMixin, GenerationConfig, INVALID_LOGPROB, OnlineTrainerState, Optional, PPOConfig, PPOTrainer, Path, PeftConfig, PeftModel, PolicyAndValueWrapper, PreTrainedTokenizerBase, PrinterCallback, ProcessorMixin, Trainer, TrainerCallback, TrainerControl, Union, batch_generation, broadcast, contextmanager, create_reference_model, defaultdict, disable_dropout_in_model, empty_cache, exact_div, first_true_indices, forward, gather_object, gc, generate_model_card, get_comet_experiment_url, get_peft_model, get_reporting_integration_callbacks, get_reward, is_peft_available, is_rich_available, is_wandb_available, log_table_to_comet_experiment, masked_mean, masked_whiten, math, nn, np, nullcontext, os, pd, peft_module_casting_to_bf16, prepare_deepspeed, print_rich_table, textwrap, time, torch, truncate_response, unwrap_model_for_generation, wandb, Optional, PeftModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothPPOConfig(PPOConfig): + """ + + Configuration class for the [`PPOTrainer`]. + + This class includes only the parameters that are specific to PPO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default + values in this class may differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. 
+ model_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the train target PEFT adapter, when using LoRA with multiple adapters. + ref_adapter_name (`str` or `None`, *optional*, defaults to `None`): + Name of the reference PEFT adapter, when using LoRA with multiple adapters. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. + whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + kl_estimator (`Literal["k1", "k3"]`, *optional*, defaults to `"k1"`): + Which estimator for KL-Divergence to use from [Approximating KL + Divergence](http://joschu.net/blog/kl-approx.html). Defaults to "k1", a straightforward, unbiased + estimator. Can be set to "k3", an unbiased estimator with lower variance which "appears to be a strictly + better estimator". Cannot be set to "k2", as it is used for logging purposes. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + vf_coef (`float`, *optional*, defaults to `0.1`): + Value function coefficient. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Clip range for the value function. + gamma (`float`, *optional*, defaults to `1.0`): + Discount factor. + lam (`float`, *optional*, defaults to `0.95`): + Lambda value for GAE. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + dataset_num_proc = 
None, + num_mini_batches = 1, + total_episodes = None, + local_rollout_forward_batch_size = 64, + num_sample_generations = 10, + response_length = 53, + stop_token = None, + stop_token_id = None, + temperature = 0.7, + missing_eos_penalty = None, + sft_model_path = 'EleutherAI/pythia-160m', + world_size = None, + num_total_batches = None, + micro_batch_size = None, + local_batch_size = None, + batch_size = None, + local_mini_batch_size = None, + mini_batch_size = None, + exp_name = 'ppo_config', + reward_model_path = 'EleutherAI/pythia-160m', + model_adapter_name = None, + ref_adapter_name = None, + num_ppo_epochs = 4, + whiten_rewards = False, + kl_coef = 0.05, + kl_estimator = 'k1', + cliprange = 0.2, + vf_coef = 0.1, + cliprange_value = 0.2, + gamma = 1.0, + lam = 0.95, + ds3_gather_for_generation = True, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise MathError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise MathError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = 
data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + dataset_num_proc = dataset_num_proc, + num_mini_batches = num_mini_batches, + total_episodes = total_episodes, + local_rollout_forward_batch_size = local_rollout_forward_batch_size, + num_sample_generations = num_sample_generations, + response_length = response_length, + stop_token = stop_token, + 
stop_token_id = stop_token_id, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + sft_model_path = sft_model_path, + world_size = world_size, + num_total_batches = num_total_batches, + micro_batch_size = micro_batch_size, + local_batch_size = local_batch_size, + batch_size = batch_size, + local_mini_batch_size = local_mini_batch_size, + mini_batch_size = mini_batch_size, + exp_name = exp_name, + reward_model_path = reward_model_path, + model_adapter_name = model_adapter_name, + ref_adapter_name = ref_adapter_name, + num_ppo_epochs = num_ppo_epochs, + whiten_rewards = whiten_rewards, + kl_coef = kl_coef, + kl_estimator = kl_estimator, + cliprange = cliprange, + vf_coef = vf_coef, + cliprange_value = cliprange_value, + gamma = gamma, + lam = lam, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothPPOTrainer(Trainer): + _tag_names = ["trl", "ppo"] + + def __init__( + self, + args: PPOConfig, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ], + model: nn.Module, + ref_model: Optional[nn.Module], + reward_model: nn.Module, + train_dataset: Dataset, + value_model: nn.Module, + data_collator: Optional[DataCollatorWithPadding] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + # less commonly used + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + callbacks: Optional[list[TrainerCallback]] = None, + peft_config: Optional["PeftConfig"] = None, + ) -> None: + if ref_model is model: + raise ValueError( + "`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the " + "same as `model`, you must make a copy of it, or `None` if you use peft." + ) + + self.args = args + self.processing_class = processing_class + self.policy_model = model + + # Define the collator if not provided + if data_collator is None: + data_collator = DataCollatorWithPadding(self.processing_class) + + # Handle stop token settings: update policy model's generation_config to use provided stop token + if args.stop_token and args.stop_token_id: + raise ValueError("You cannot set both `stop_token` and `stop_token_id`.") + elif args.stop_token: + if args.stop_token == "eos": + self.policy_model.generation_config.eos_token_id = self.stop_token_id = processing_class.eos_token_id + else: + raise ValueError( + f"Unknown `stop_token` {args.stop_token}. Allowed values are: `'eos'` and `None` (no stop token)." + ) + else: + self.policy_model.generation_config.eos_token_id = self.stop_token_id = args.stop_token_id # None or int + + # Check that the kl estimator is valid + if self.args.kl_estimator not in {"k1", "k3"}: + raise ValueError( + "kl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, " + "appears to be a strictly better estimator). See " + "[Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details." 
+ ) + + # peft support + if not is_peft_available() and peft_config is not None: + raise ImportError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + # if model is a peft model and we have a peft_confg, we merge and unload it first + if isinstance(self.policy_model, PeftModel): + self.policy_model = self.policy_model.merge_and_unload() + + # get peft model with the given config + self.policy_model = get_peft_model(self.policy_model, peft_config) + if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False): + peft_module_casting_to_bf16(self.policy_model) + + self.is_peft_model = is_peft_available() and isinstance(self.policy_model, PeftModel) + self.model_adapter_name = args.model_adapter_name + self.ref_adapter_name = args.ref_adapter_name + + if ref_model: + self.ref_model = ref_model + elif self.is_peft_model: + self.ref_model = None + else: + self.ref_model = create_reference_model(self.policy_model) + + self.reward_model = reward_model + self.train_dataset = train_dataset + self.train_dataset_len = len(train_dataset) + self.value_model = value_model + self.data_collator = data_collator + self.eval_dataset = eval_dataset + self.optimizer, self.lr_scheduler = optimizers + self.optimizer_cls_and_kwargs = None # needed for transformers >= 4.47 + + ######### + # calculate various batch sizes + ######### + if args.total_episodes is None: # allow the users to define episodes in terms of epochs. + args.total_episodes = int(args.num_train_epochs * self.train_dataset_len) + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) + self.accelerator = accelerator + args.world_size = accelerator.num_processes + args.local_batch_size = args.per_device_train_batch_size * args.gradient_accumulation_steps + args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size) + args.batch_size = int(args.local_batch_size * args.world_size) + args.mini_batch_size = exact_div( + args.batch_size, args.num_mini_batches, "`batch_size` must be a multiple of `num_mini_batches`" + ) + args.local_mini_batch_size = exact_div( + args.local_batch_size, args.num_mini_batches, "`local_batch_size` must be a multiple of `num_mini_batches`" + ) + if args.whiten_rewards: + assert args.local_mini_batch_size >= 8, ( + f"Per-rank minibatch size {args.local_mini_batch_size} is insufficient for whitening" + ) + # `per_rank_rollout_batch_size` is our `args.local_batch_size` + # `per_rank_minibatch_size` is our `args.local_mini_batch_size` + args.num_total_batches = math.ceil( + args.total_episodes / args.batch_size + ) # we may train for more than `total_episodes` + time_tensor = torch.tensor(int(time.time()), device=accelerator.device) + time_int = broadcast(time_tensor, 0).item() # avoid different timestamps across processes + args.run_name = f"{args.exp_name}__{args.seed}__{time_int}" + self.local_seed = args.seed + accelerator.process_index * 100003 # Prime + if args.num_sample_generations > 0: + self.sample_generations_freq = max(1, args.num_total_batches // args.num_sample_generations) + self.local_dataloader_batch_size = args.local_batch_size + + ######### + # setup model, optimizer, and others + ######### + for module in [self.policy_model, self.ref_model, self.value_model, self.reward_model]: + if module is not None: + disable_dropout_in_model(module) + self.model = PolicyAndValueWrapper(self.policy_model, self.value_model) + 
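The batch-size bookkeeping a few lines above (local, micro, mini and global batch sizes derived from the per-device batch size, gradient accumulation and world size) is easy to misread; here is a worked example with made-up settings that mirrors that arithmetic without calling into the trainer:

# Illustrative numbers only: 8 processes, per-device batch 4, 2 accumulation steps, 2 mini-batches.
world_size = 8
per_device_train_batch_size = 4
gradient_accumulation_steps = 2
num_mini_batches = 2

local_batch_size = per_device_train_batch_size * gradient_accumulation_steps  # rollout prompts per rank: 8
micro_batch_size = per_device_train_batch_size * world_size                   # prompts per optimizer micro-step: 32
batch_size = local_batch_size * world_size                                    # prompts per PPO update: 64

# exact_div in the trainer enforces divisibility; plain asserts stand in for it here.
assert batch_size % num_mini_batches == 0
assert local_batch_size % num_mini_batches == 0
mini_batch_size = batch_size // num_mini_batches               # 32
local_mini_batch_size = local_batch_size // num_mini_batches   # 4 per rank
print(local_batch_size, micro_batch_size, batch_size, mini_batch_size, local_mini_batch_size)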
self.model.config = self.policy_model.config # needed for pushing to hub + self.create_optimizer_and_scheduler( + num_training_steps=args.num_total_batches + ) # note that we are calling `self.lr_scheduler.step[]` manually only at the batch level + + ######### + ### trainer specifics + ######### + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) + self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks + self.callback_handler = CallbackHandler( + self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler + ) + self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) + self.control = TrainerControl() + self.state = OnlineTrainerState( + is_local_process_zero=self.is_local_process_zero(), + is_world_process_zero=self.is_world_process_zero(), + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ], + ) + self.current_flos = 0 + self.hp_search_backend = None + self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None + self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None + # Create distant repo and output directory if needed + self.hub_model_id = None + if self.args.push_to_hub: + self.init_hf_repo() + if self.args.should_save: + os.makedirs(self.args.output_dir, exist_ok=True) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + ######### + ### setup dataloader + ######### + self.dataloader = DataLoader( + self.train_dataset, + batch_size=self.local_dataloader_batch_size, + shuffle=True, + collate_fn=self.data_collator, + drop_last=True, # needed; otherwise the last batch will be of ragged shape + ) + # sync random states for DataLoader[shuffle=True] before `accelerator.prepare` + # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c + torch.manual_seed(args.seed) + self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader) + torch.manual_seed(self.local_seed) # reset the local seed again + + self.eval_dataloader = DataLoader( + self.eval_dataset, + batch_size=args.per_device_eval_batch_size, + collate_fn=self.data_collator, + drop_last=True, + ) # no need to shuffle eval dataset + self.eval_dataloader = accelerator.prepare(self.eval_dataloader) + + if self.is_deepspeed_enabled: + self.reward_model = prepare_deepspeed( + self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + + if self.ref_model is None: + if not self.is_peft_model: + raise ValueError("No reference model and model is not a Peft model.") + else: + self.ref_model = prepare_deepspeed( + self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + else: + if self.ref_model is None: + if not self.is_peft_model: + raise ValueError("No reference model and model is not a Peft model.") + else: + self.ref_model = self.ref_model.to(self.accelerator.device) + self.reward_model = self.reward_model.to(self.accelerator.device) + + def get_train_dataloader(self) -> DataLoader: + return self.dataloader + + def get_eval_dataloader(self) -> DataLoader: + return self.eval_dataloader + + @contextmanager + def null_ref_context(self): + """Context manager for handling null reference model (that is, peft adapter manipulation).""" + 
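In plain terms: when the policy is a PEFT model and no separate ref_model was passed, reference log-probabilities come from the same weights with the adapter temporarily disabled, so no second copy of the model is kept in memory. The toy, self-contained sketch below illustrates that pattern with a stand-in class; it is not the PEFT API.

from contextlib import contextmanager, nullcontext

class ToyPeftPolicy:
    """Stand-in for a PEFT-wrapped policy; disabling the adapter recovers base-model behaviour."""
    def __init__(self):
        self.adapter_enabled = True
    @contextmanager
    def disable_adapter(self):
        self.adapter_enabled = False
        try:
            yield
        finally:
            self.adapter_enabled = True

policy = ToyPeftPolicy()

@contextmanager
def toy_null_ref_context(is_peft_model: bool = True):
    # Same idea as the method above: under PEFT the "reference model" is the policy with its adapter off.
    with (policy.disable_adapter() if is_peft_model else nullcontext()):
        yield

with toy_null_ref_context():
    print(policy.adapter_enabled)  # False -> behaves like the reference model here
print(policy.adapter_enabled)      # True  -> back to the trained policy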
with ( + self.accelerator.unwrap_model(self.model.policy).disable_adapter() + if self.is_peft_model and not self.ref_adapter_name + else nullcontext() + ): + if self.ref_adapter_name: + self.model.policy.set_adapter(self.ref_adapter_name) + yield + if self.ref_adapter_name: + self.model.policy.set_adapter(self.model_adapter_name or "default") + + def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): + backup_model = self.model + self.model = self.model.policy # save only the policy + + if self.is_deepspeed_enabled: + backup_deepspeed = self.deepspeed + self.deepspeed = self.model + + super().save_model(output_dir, _internal_call) + + self.model = backup_model + + if self.is_deepspeed_enabled: + self.deepspeed = backup_deepspeed + + def train(self): + args = self.args + accelerator = self.accelerator + optimizer = self.optimizer + model = self.model + ref_policy = self.ref_model + reward_model = self.reward_model + processing_class = self.processing_class + dataloader = self.dataloader + device = accelerator.device + + def repeat_generator(): + while True: + yield from dataloader + + iter_dataloader = iter(repeat_generator()) + generation_config = GenerationConfig( + max_new_tokens=args.response_length, + temperature=(args.temperature + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + accelerator.print("===training policy===") + start_time = time.time() + stats_shape = (args.num_ppo_epochs, args.num_mini_batches, args.gradient_accumulation_steps) + approxkl_stats = torch.zeros(stats_shape, device=device) + pg_clipfrac_stats = torch.zeros(stats_shape, device=device) + pg_loss_stats = torch.zeros(stats_shape, device=device) + vf_loss_stats = torch.zeros(stats_shape, device=device) + vf_clipfrac_stats = torch.zeros(stats_shape, device=device) + entropy_stats = torch.zeros(stats_shape, device=device) + ratio_stats = torch.zeros(stats_shape, device=device) + model.train() + + # trainer state initialization + self.state.global_step = 0 + self.state.episode = 0 + self.state.max_steps = args.num_total_batches + self.state.num_train_epochs = args.total_episodes / self.train_dataset_len + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(self.state.max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(self.state.max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(self.state.max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model + self.model_wrapped = self.model + + for update in range(1, args.num_total_batches + 1): + self.state.episode += 1 * args.batch_size + data = next(iter_dataloader) + with torch.no_grad(): + queries = data["input_ids"].to(device) + context_length = queries.shape[1] + responses = [] + postprocessed_responses = [] + logprobs = [] + ref_logprobs = [] + scores = [] + sequence_lengths = [] + values = [] + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + 
query_responses, logitss = batch_generation( + unwrapped_model.policy, + queries, + args.local_rollout_forward_batch_size, + processing_class.pad_token_id, + generation_config, + ) + + for i in range(0, queries.shape[0], args.local_rollout_forward_batch_size): + query = queries[i : i + args.local_rollout_forward_batch_size] + query_response = query_responses[i : i + args.local_rollout_forward_batch_size] + response = query_response[:, context_length:] + logits = logitss[i : i + args.local_rollout_forward_batch_size] + logprob = selective_log_softmax(logits, response) + del logits + empty_cache() + + if ref_policy is None: + with self.null_ref_context(): + ref_output = forward(model.policy, query_response, processing_class.pad_token_id) + else: + ref_output = forward(ref_policy, query_response, processing_class.pad_token_id) + ref_logits = ref_output.logits[:, context_length - 1 : -1] + ref_logits /= args.temperature + 1e-7 + ref_logprob = selective_log_softmax(ref_logits, response) + del ref_output, ref_logits + empty_cache() + + # Response Processing 1. truncate response after the first occurrence of `stop_token_id` + postprocessed_response = response + if self.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + self.stop_token_id, processing_class.pad_token_id, response + ) + + # Response Processing 2. run reward model on the truncated responses + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + sequence_length = first_true_indices(postprocessed_response == processing_class.pad_token_id) - 1 + unwrapped_value_model = accelerator.unwrap_model(model).value_model + full_value, _, _ = get_reward( + unwrapped_value_model, query_response, processing_class.pad_token_id, context_length + ) + value = full_value[:, context_length - 1 : -1].squeeze(-1) + _, score, _ = get_reward( + reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length + ) + + responses.append(response) + postprocessed_responses.append(postprocessed_response) + logprobs.append(logprob) + ref_logprobs.append(ref_logprob) + sequence_lengths.append(sequence_length) + scores.append(score) + values.append(value) + responses = torch.cat(responses, 0) + postprocessed_responses = torch.cat(postprocessed_responses, 0) + logprobs = torch.cat(logprobs, 0) + ref_logprobs = torch.cat(ref_logprobs, 0) + sequence_lengths = torch.cat(sequence_lengths, 0) + scores = torch.cat(scores, 0) + values = torch.cat(values, 0) + del (logprob, ref_logprob, full_value, value, score, unwrapped_model) + empty_cache() + gc.collect() + + # Response Processing 3. Filter completion. Ensure that the sample contains stop_token_id + # Completions not passing that filter will receive a lower score. 
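As the in-code comments above note, each response is truncated after the first occurrence of stop_token_id before being scored. A minimal stand-alone sketch of that truncation (made-up token ids, not TRL's truncate_response implementation) looks like this:

import torch

def truncate_after_stop(responses: torch.Tensor, stop_token_id: int, pad_token_id: int) -> torch.Tensor:
    """Keep tokens up to and including the first stop token; replace everything after it with padding."""
    is_stop = responses == stop_token_id
    seq_len = responses.shape[1]
    # Index of the first stop token per row; rows without one keep their full length.
    first_stop = torch.where(is_stop.any(dim=1), is_stop.int().argmax(dim=1),
                             torch.full((responses.shape[0],), seq_len))
    positions = torch.arange(seq_len).unsqueeze(0)
    keep = positions <= first_stop.unsqueeze(1)
    return torch.where(keep, responses, torch.full_like(responses, pad_token_id))

# Made-up ids: 2 = stop/eos token, 0 = pad token.
resp = torch.tensor([[5, 7, 2, 9, 4],
                     [6, 6, 6, 6, 6]])
print(truncate_after_stop(resp, stop_token_id=2, pad_token_id=0))
# tensor([[5, 7, 2, 0, 0],
#         [6, 6, 6, 6, 6]])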
+ contain_eos_token = torch.any(postprocessed_responses == self.processing_class.eos_token_id, dim=-1) + if self.args.missing_eos_penalty is not None: + scores[~contain_eos_token] -= self.args.missing_eos_penalty + # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}") + + # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw + response_idxs = torch.arange(responses.shape[1], device=responses.device).repeat(responses.shape[0], 1) + padding_mask = response_idxs > sequence_lengths.unsqueeze(1) + logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB) + ref_logprobs = torch.masked_fill(ref_logprobs, padding_mask, INVALID_LOGPROB) + sequence_lengths_p1 = sequence_lengths + 1 + padding_mask_p1 = response_idxs > (sequence_lengths_p1.unsqueeze(1)) + values = torch.masked_fill(values, padding_mask_p1, 0) + + # 4. compute rewards + # Formula used by http://joschu.net/blog/kl-approx.html for the k1 and k3 estimators + logr = ref_logprobs - logprobs + kl = -logr if args.kl_estimator == "k1" else (logr.exp() - 1) - logr # Else statement is k3 + non_score_reward = -args.kl_coef * kl + rewards = non_score_reward.clone() + actual_start = torch.arange(rewards.size(0), device=rewards.device) + actual_end = torch.where(sequence_lengths_p1 < rewards.size(1), sequence_lengths_p1, sequence_lengths) + rewards[[actual_start, actual_end]] += scores + + # 5. whiten rewards + if args.whiten_rewards: + rewards = masked_whiten(rewards, mask=~padding_mask_p1, shift_mean=False) + rewards = torch.masked_fill(rewards, padding_mask_p1, 0) + + # 6. compute advantages and returns + lastgaelam = 0 + advantages_reversed = [] + gen_length = responses.shape[1] + for t in reversed(range(gen_length)): + nextvalues = values[:, t + 1] if t < gen_length - 1 else 0.0 + delta = rewards[:, t] + args.gamma * nextvalues - values[:, t] + lastgaelam = delta + args.gamma * args.lam * lastgaelam + advantages_reversed.append(lastgaelam) + advantages = torch.stack(advantages_reversed[::-1], axis=1) + returns = advantages + values + advantages = masked_whiten(advantages, ~padding_mask) + advantages = torch.masked_fill(advantages, padding_mask, 0) + empty_cache() + + # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch + for ppo_epoch_idx in range(args.num_ppo_epochs): + b_inds = np.random.permutation(args.local_batch_size) + minibatch_idx = 0 + for mini_batch_start in range(0, args.local_batch_size, args.local_mini_batch_size): + mini_batch_end = mini_batch_start + args.local_mini_batch_size + mini_batch_inds = b_inds[mini_batch_start:mini_batch_end] + gradient_accumulation_idx = 0 + for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size): + with accelerator.accumulate(model): + micro_batch_end = micro_batch_start + args.per_device_train_batch_size + micro_batch_inds = mini_batch_inds[micro_batch_start:micro_batch_end] + mb_advantage = advantages[micro_batch_inds] + mb_responses = responses[micro_batch_inds] + mb_query_responses = query_responses[micro_batch_inds] + mb_logprobs = logprobs[micro_batch_inds] + mb_return = returns[micro_batch_inds] + mb_values = values[micro_batch_inds] + + output, vpred_temp = forward(model, mb_query_responses, processing_class.pad_token_id) + logits = output.logits[:, context_length - 1 : -1] + logits /= args.temperature + 1e-7 + new_logprobs = selective_log_softmax(logits, mb_responses) + new_logprobs = torch.masked_fill( + 
new_logprobs, padding_mask[micro_batch_inds], INVALID_LOGPROB + ) + vpred = vpred_temp[:, context_length - 1 : -1].squeeze(-1) + vpred = torch.masked_fill(vpred, padding_mask_p1[micro_batch_inds], 0) + vpredclipped = torch.clamp( + vpred, + mb_values - args.cliprange_value, + mb_values + args.cliprange_value, + ) + vf_losses1 = torch.square(vpred - mb_return) + vf_losses2 = torch.square(vpredclipped - mb_return) + vf_loss_max = torch.max(vf_losses1, vf_losses2) + vf_loss = 0.5 * masked_mean(vf_loss_max, ~padding_mask_p1[micro_batch_inds]) + vf_clipfrac = masked_mean( + (vf_losses2 > vf_losses1).float(), ~padding_mask_p1[micro_batch_inds] + ) + logprobs_diff = new_logprobs - mb_logprobs + ratio = torch.exp(logprobs_diff) + pg_losses = -mb_advantage * ratio + pg_losses2 = -mb_advantage * torch.clamp(ratio, 1.0 - args.cliprange, 1.0 + args.cliprange) + pg_loss_max = torch.max(pg_losses, pg_losses2) + pg_loss = masked_mean(pg_loss_max, ~padding_mask[micro_batch_inds]) + loss = pg_loss + args.vf_coef * vf_loss + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + with torch.no_grad(): + pg_clipfrac = masked_mean( + (pg_losses2 > pg_losses).float(), ~padding_mask[micro_batch_inds] + ) + prob_dist = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(prob_dist * logits, dim=-1) + approxkl = 0.5 * (logprobs_diff**2).mean() + approxkl_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = approxkl + pg_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ( + pg_clipfrac + ) + pg_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = pg_loss + vf_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = vf_loss + vf_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ( + vf_clipfrac + ) + entropy_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = entropy.mean() + ratio_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ratio.mean() + gradient_accumulation_idx += 1 + minibatch_idx += 1 + # del everything and empty cache + # fmt: off + del ( + output, vpred_temp, logits, new_logprobs, vpred, vpredclipped, + vf_losses1, vf_losses2, vf_loss, vf_clipfrac, logprobs_diff, ratio, pg_losses, pg_losses2, pg_loss_max, + pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, mb_return, + mb_advantage, mb_values, mb_responses, mb_query_responses, mb_logprobs, + ) + # fmt: on + empty_cache() + with torch.no_grad(): + mean_kl = kl.sum(1).mean() + mean_entropy = (-logprobs).sum(1).mean() + mean_non_score_reward = non_score_reward.sum(1).mean() + rlhf_reward = mean_non_score_reward + scores.mean() + eps = int(self.state.episode / (time.time() - start_time)) + metrics = {} + metrics["eps"] = eps + metrics["objective/kl"] = self.accelerator.gather_for_metrics(mean_kl).mean().item() + metrics["objective/entropy"] = self.accelerator.gather_for_metrics(mean_entropy).mean().item() + metrics["objective/non_score_reward"] = ( + self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item() + ) + metrics["objective/rlhf_reward"] = self.accelerator.gather_for_metrics(rlhf_reward).mean().item() + metrics["objective/scores"] = self.accelerator.gather_for_metrics(scores.mean()).mean().item() + metrics["policy/approxkl_avg"] = self.accelerator.gather_for_metrics(approxkl_stats).mean().item() + metrics["policy/clipfrac_avg"] = self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item() + metrics["loss/policy_avg"] = 
self.accelerator.gather_for_metrics(pg_loss_stats).mean().item() + metrics["loss/value_avg"] = self.accelerator.gather_for_metrics(vf_loss_stats).mean().item() + metrics["val/clipfrac_avg"] = self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item() + metrics["policy/entropy_avg"] = self.accelerator.gather_for_metrics(entropy_stats).mean().item() + metrics["val/ratio"] = self.accelerator.gather_for_metrics(ratio_stats).mean().item() + metrics["val/ratio_var"] = self.accelerator.gather_for_metrics(ratio_stats).var().item() + metrics["val/num_eos_tokens"] = (responses == processing_class.eos_token_id).sum().item() + metrics["lr"] = self.lr_scheduler.get_last_lr()[0] + metrics["episode"] = self.state.episode + self.state.epoch = self.state.episode / self.train_dataset_len # used by self.log + self.state.global_step += 1 + self.log(metrics) + + self.lr_scheduler.step() + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + del kl, mean_kl, mean_entropy, mean_non_score_reward, scores, metrics, non_score_reward + empty_cache() + gc.collect() + + if args.num_sample_generations > 0 and (update - 1) % self.sample_generations_freq == 0: + self.generate_completions(sampling=True) + empty_cache() + del ( + query_responses, + responses, + postprocessed_responses, + logprobs, + ref_logprobs, + values, + sequence_lengths, + contain_eos_token, + sequence_lengths_p1, + response_idxs, + padding_mask, + padding_mask_p1, + rewards, + actual_start, + actual_end, + advantages, + returns, + ) + empty_cache() + + # HF trainer specifics + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None, metrics=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + def generate_completions(self, sampling: bool = False): + args = self.args + processing_class = self.processing_class + generation_config = GenerationConfig( + max_new_tokens=self.args.response_length, + temperature=(0.01 + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + table = defaultdict(list) + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + for batch in self.eval_dataloader: + query = batch["input_ids"] + with torch.no_grad(): + context_length = query.shape[1] + query_response, _ = batch_generation( + unwrapped_model.policy, + query, + query.shape[0], + processing_class.pad_token_id, + generation_config, + ) + response = query_response[:, context_length:] + postprocessed_response = response + if self.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + self.stop_token_id, processing_class.pad_token_id, response + ) + table["query"].extend( + gather_object(processing_class.batch_decode(query, skip_special_tokens=True)) + ) + table["model response"].extend( + gather_object(processing_class.batch_decode(postprocessed_response)) + ) + + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + _, score, _ = get_reward( + self.reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length + ) + table["score"].extend(self.accelerator.gather_for_metrics(score).float().cpu().numpy()) + + if 
sampling: + break + df = pd.DataFrame(table) + + if self.accelerator.is_main_process: + if is_rich_available(): + print_rich_table(df.iloc[0 : 0 + 5]) + if "wandb" in args.report_to: + import wandb + + if wandb.run is not None: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + if "comet_ml" in args.report_to: + log_table_to_comet_experiment( + name="completions.csv", + table=df, + ) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{mziegler2019fine-tuning, + title = {{Fine-Tuning Language Models from Human Preferences}}, + author = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. 
Christiano and Geoffrey Irving}, + year = 2019, + eprint = {arXiv:1909.08593} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="PPO", + trainer_citation=citation, + paper_title="Fine-Tuning Language Models from Human Preferences", + paper_id="1909.08593", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothPPOTrainer(_UnslothPPOTrainer): + """ + + """ + def __init__( + self, + args, + processing_class, + model, + ref_model, + reward_model, + train_dataset, + value_model, + data_collator = None, + eval_dataset = None, + callbacks = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothPPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('ppo_trainer', other_metrics) + + super().__init__( + args = args, + processing_class = processing_class, + model = model, + ref_model = ref_model, + reward_model = reward_model, + train_dataset = train_dataset, + value_model = value_model, + data_collator = data_collator, + eval_dataset = eval_dataset, + callbacks = callbacks, + peft_config = peft_config,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothPRMTrainer.py b/unsloth_compiled_cache/UnslothPRMTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b8195131b2cf39a580a8222ce22a9cfb57fc50a6 --- /dev/null +++ b/unsloth_compiled_cache/UnslothPRMTrainer.py @@ -0,0 +1,826 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.prm_trainer import (BaseImageProcessor, Callable, DataCollator, DataCollatorForTokenClassification, Dataset, EvalPrediction, FeatureExtractionMixin, Optional, PRMConfig, PRMTrainer, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, Trainer, TrainerCallback, Union, chain, compute_accuracy, disable_dropout_in_model, features, generate_model_card, inspect, is_peft_available, is_wandb_available, nn, os, prepare_model_for_kbit_training, textwrap, torch, wandb, warnings, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = 
logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothPRMConfig(PRMConfig): + """ + + Configuration class for the [`PRMTrainer`]. + + This class includes only the parameters that are specific to PRM training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) used for truncation. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt used for truncation. + max_completion_length (`int` or `None`, *optional*, defaults to `None`): + Maximum length of the completion used for truncation. The completion is the concatenation of the steps. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + step_separator (`str`, *optional*, defaults to `"\n"`): + Separator used to separate each step of the reasoning process. + train_on_last_step_only (`bool`, *optional*, defaults to `False`): + Whether to train only on the last step. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + max_length = 1024, 
+ max_prompt_length = 512, + max_completion_length = None, + disable_dropout = True, + step_separator = '\ +', + train_on_last_step_only = False, + dataset_num_proc = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = 
ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + max_prompt_length = max_prompt_length, + max_completion_length = max_completion_length, + disable_dropout = disable_dropout, + step_separator = step_separator, + train_on_last_step_only = train_on_last_step_only, + dataset_num_proc = dataset_num_proc,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothPRMTrainer(Trainer): + """""" + + _tag_names = ["trl", "prm"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module]] = None, + args: Optional[PRMConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: 
Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + ): + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + if not isinstance(model, PeftModel): + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False): + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None: + warnings.warn( + "You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. " + "please update to the latest version of peft to use `gradient_checkpointing_kwargs`." + ) + elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + + model = model + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(model) + + if compute_metrics is None: + compute_metrics = compute_accuracy + + if data_collator is None: + if processing_class is None: + raise ValueError( + "A processing_class must be specified when using the default DataCollatorForTokenClassification" + ) + data_collator = DataCollatorForTokenClassification(processing_class, max_length=args.max_length) + + if "input_ids" not in train_dataset.column_names: + with PartialState().main_process_first(): + fn_kwargs = { + "tokenizer": processing_class, + "step_separator": args.step_separator, + "max_length": args.max_length, + "max_prompt_length": args.max_prompt_length, + "max_completion_length": args.max_completion_length, + "train_on_last_step_only": args.train_on_last_step_only, + } + train_fn_kwargs = {**fn_kwargs, "is_eval": False} + train_dataset = train_dataset.map( + self.tokenize_row, + fn_kwargs=train_fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=train_dataset.features, + desc="Tokenizing train dataset", + features=features.Features( # needed to avoid map to cast labels to bool + { + "labels": features.Sequence(features.Value("int64")), + "input_ids": features.Sequence(features.Value("int64")), + } + ), + ) + + eval_fn_kwargs = {**fn_kwargs, "is_eval": True} + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + self.tokenize_row, + fn_kwargs=eval_fn_kwargs, + num_proc=args.dataset_num_proc, + remove_columns=eval_dataset.features, + desc="Tokenizing eval dataset", + features=features.Features( # needed to avoid map to cast labels to bool + { + "labels": features.Sequence(features.Value("int64")), + "input_ids": features.Sequence(features.Value("int64")), + } + ), + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + 
self.model.add_model_tags(self._tag_names) + + @staticmethod + def tokenize_row( + features, + tokenizer, + step_separator, + max_length, + max_prompt_length, + max_completion_length, + train_on_last_step_only, + is_eval, + ): + r""" + Tokenize a row of the dataset. + + Args: + features (`dict[str, str]`): + Row of the dataset, should contain the keys `"prompt"`, `"completions"`, and `"labels"`. + tokenizer (`PreTrainedTokenizerBase`): + Tokenizer used to process the data. + step_separator (`str`): + Separator between steps in the completion. + max_length (`int` or `None`): + Maximum length of the sequences (prompt + completion). If `None`, the sequences are not truncated. + max_prompt_length (`int` or `None`): + Maximum length of the prompt. If `None`, the prompt is not truncated. + max_completion_length (`int` or `None`): + Maximum length of the completion sequences. If `None`, the completion sequences are not truncated. + train_on_last_step_only (`bool`): + Whether to train only on the last step. If `True`, the labels are `-100` for all tokens except the last + token of the completion. + is_eval (`bool`): + Whether the function is used to tokenize samples from a training or an evaluation dataset. Used only if + `train_on_last_step_only` is set to `True`. + + Returns: + `dict[str, list[int]]`: + Tokenized sequences with the keys `"input_ids"`, and `"labels". + + Example: + ```python + >>> from transformers import AutoTokenizer + + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B") + >>> features = { + ... "prompt": "Which number is larger, 9.8 or 9.11?", + ... "completions": ["11 is greater than 8.", "Hence, 9.11 > 9.8."], + ... "labels": [True, False], + ... } + >>> PRMTrainer.tokenize_row( + ... features, tokenizer, "\n", max_completion_length=None, train_on_last_step_only=False, is_eval=False + ... 
) + {'input_ids': [23085, 1372, 374, 8131, 11, 220, 24, 13, 23, 476, 220, 24, 13, 16, 16, 30, 16, 16, 374, 7046, 1091, 220, 23, 13, 198, 39, 763, 11, 220, 24, 13, 16, 16, 861, 220, 24, 13, 23, 13, 198], + 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 0]} + ``` + """ + # Tokenize the prompt and completions + prompt_ids = tokenizer(features["prompt"], add_special_tokens=False)["input_ids"] + completions_ids = [ + tokenizer(completion, add_special_tokens=False)["input_ids"] for completion in features["completions"] + ] + if train_on_last_step_only and not is_eval: + labels = [-100] * (len(features["labels"]) - 1) + [int(features["labels"][-1])] + else: + labels = [int(label) for label in features["labels"]] + + # Get the ID of the separator token and add it to the completions + separator_ids = tokenizer.encode(step_separator, add_special_tokens=False) + completions_ids = [completion + separator_ids for completion in completions_ids] + + # Create the label + labels = [[-100] * (len(completion) - 1) + [label] for completion, label in zip(completions_ids, labels)] + + # Join the completions and labels steps + completion_ids = list(chain(*completions_ids)) + labels = list(chain(*labels)) + + if tokenizer.bos_token_id is not None: + prompt_ids = [tokenizer.bos_token_id] + prompt_ids + + # Truncate prompt and completion sequences + if max_prompt_length is not None: + prompt_ids = prompt_ids[-max_prompt_length:] + if max_completion_length is not None: + completion_ids = completion_ids[:max_completion_length] + labels = labels[:max_completion_length] + + input_ids = prompt_ids + completion_ids + labels = [-100] * len(prompt_ids) + labels + + if max_length is not None: + input_ids = input_ids[:max_length] + labels = labels[:max_length] + + return {"input_ids": input_ids, "labels": labels} + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{uesato2022solving, + title = {{Solving Math Word Problems With Process- and Outcome-Based Feedback}}, + author = {Uesato, Jonathan and Kushman, Nate and Kumar, Ramana and Song, Francis and Siegel, Noah and Wang, Lisa and Creswell, Antonia and Irving, Geoffrey and Higgins, Irina}, + year = 2022, + journal = {arXiv preprint arXiv:2211.14275} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + trainer_name="PRM", + trainer_citation=citation, + paper_title="Solving math word problems with process-and outcome-based feedback", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothPRMTrainer(_UnslothPRMTrainer): + """ + + Initialize PRMTrainer. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForTokenClassification`. + args (`PRMConfig`): + The arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DataCollatorForTokenClassification`) will be used which will pad the sequences to the maximum length of + the sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`): + The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) + will be used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in + a PEFT model. 
+ + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothPRMConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if 
locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('prm_trainer', other_metrics) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothRLOOTrainer.py b/unsloth_compiled_cache/UnslothRLOOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..0b6ba759487515f206318db2d4a1911e7c80ca2d --- /dev/null +++ b/unsloth_compiled_cache/UnslothRLOOTrainer.py @@ -0,0 +1,1155 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import 
torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.rloo_trainer import (Accelerator, BaseImageProcessor, Callable, CallbackHandler, DEFAULT_CALLBACKS, DEFAULT_PROGRESS_CALLBACK, DataCollatorWithPadding, DataLoader, Dataset, ExportableState, FeatureExtractionMixin, GenerationConfig, INVALID_LOGPROB, OnlineTrainerState, Optional, Path, PreTrainedTokenizerBase, PrinterCallback, ProcessorMixin, RLOOConfig, RLOOTrainer, Trainer, TrainerCallback, TrainerControl, Union, batch_generation, broadcast, defaultdict, disable_dropout_in_model, empty_cache, exact_div, first_true_indices, forward, gather_object, gc, generate_model_card, get_comet_experiment_url, get_reporting_integration_callbacks, get_reward, is_rich_available, is_wandb_available, log_table_to_comet_experiment, math, nn, np, os, pd, prepare_deepspeed, print_rich_table, textwrap, time, torch, truncate_response, unwrap_model_for_generation, wandb, Optional, Trainer, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothRLOOConfig(RLOOConfig): + """ + + Configuration class for the [`RLOOTrainer`]. + + This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default + values in this class may differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. + whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + rloo_k (`int`, *optional*, defaults to `2`): + REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. + normalize_reward (`bool`, *optional*, defaults to `False`): + Whether to normalize rewards. 
+ reward_clip_range (`float`, *optional*, defaults to `10.0`): + Clip range for rewards. + normalize_advantage (`bool`, *optional*, defaults to `False`): + Whether to normalize advantages. + token_level_kl (`bool`, *optional*, defaults to `True`): + Whether to use token-level KL penalty or sequence-level KL penalty. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + 
hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + dataset_num_proc = None, + num_mini_batches = 1, + total_episodes = None, + local_rollout_forward_batch_size = 64, + num_sample_generations = 10, + response_length = 53, + stop_token = None, + stop_token_id = None, + temperature = 0.7, + missing_eos_penalty = None, + sft_model_path = 'EleutherAI/pythia-160m', + world_size = None, + num_total_batches = None, + micro_batch_size = None, + local_batch_size = None, + batch_size = None, + local_mini_batch_size = None, + mini_batch_size = None, + exp_name = 'rloo_config', + reward_model_path = 'EleutherAI/pythia-160m', + num_ppo_epochs = 4, + whiten_rewards = False, + kl_coef = 0.05, + cliprange = 0.2, + rloo_k = 2, + normalize_reward = False, + reward_clip_range = 10.0, + normalize_advantage = False, + token_level_kl = False, + ds3_gather_for_generation = True, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! 
Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + 
optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + dataset_num_proc = dataset_num_proc, + num_mini_batches = num_mini_batches, + total_episodes = total_episodes, + local_rollout_forward_batch_size = local_rollout_forward_batch_size, + num_sample_generations = num_sample_generations, + response_length = response_length, + stop_token = stop_token, + stop_token_id = stop_token_id, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + sft_model_path = sft_model_path, + world_size = world_size, + num_total_batches = num_total_batches, + micro_batch_size = micro_batch_size, + local_batch_size = local_batch_size, + batch_size = batch_size, + local_mini_batch_size = local_mini_batch_size, + mini_batch_size = mini_batch_size, + exp_name = exp_name, + reward_model_path = reward_model_path, + num_ppo_epochs = num_ppo_epochs, + whiten_rewards = whiten_rewards, + kl_coef = kl_coef, + cliprange = cliprange, + rloo_k = rloo_k, + normalize_reward = normalize_reward, + reward_clip_range = reward_clip_range, + normalize_advantage = normalize_advantage, + token_level_kl = token_level_kl, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothRLOOTrainer(Trainer): + _tag_names = ["trl", "rloo"] + + def __init__( + self, + config: RLOOConfig, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ], + policy: nn.Module, + ref_policy: nn.Module, + 
reward_model: Union[nn.Module, Callable[[list[str]], list[float]]], + train_dataset: Dataset, + data_collator: Optional[DataCollatorWithPadding] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + # less commonly used + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + callbacks: Optional[list[TrainerCallback]] = None, + ) -> None: + if ref_policy is policy: + raise ValueError( + "`policy` and `ref_policy` cannot be the same object. If you want `ref_policy` to be the " + "same as `policy`, you must pass a copy of it, or `None` if you use peft." + ) + + self.args = config + args = config + self.processing_class = processing_class + self.policy = policy + + # Define the collator if not provided + if data_collator is None: + data_collator = DataCollatorWithPadding(self.processing_class) + + self.policy.generation_config.eos_token_id = ( + None # disable `pad_token_id` and `eos_token_id` because we just want to + ) + self.policy.generation_config.pad_token_id = None # generate tokens without truncation / padding + + self.ref_policy = ref_policy + self.reward_model = reward_model + self.train_dataset = train_dataset + self.train_dataset_len = len(train_dataset) + self.data_collator = data_collator + self.eval_dataset = eval_dataset + self.optimizer, self.lr_scheduler = optimizers + self.optimizer_cls_and_kwargs = None # needed for transformers >= 4.47 + + ######### + # calculate various batch sizes + ######### + if args.total_episodes is None: # allow the users to define episodes in terms of epochs. + args.total_episodes = int(args.num_train_epochs * self.train_dataset_len) + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps) + self.accelerator = accelerator + args.world_size = accelerator.num_processes + args.local_batch_size = ( + args.per_device_train_batch_size * args.gradient_accumulation_steps * args.num_mini_batches + ) + args.micro_batch_size = int(args.per_device_train_batch_size * args.world_size) + args.batch_size = int(args.local_batch_size * args.world_size) + args.mini_batch_size = exact_div( + args.batch_size, args.num_mini_batches, "`batch_size` must be a multiple of `num_mini_batches`" + ) + args.local_mini_batch_size = exact_div( + args.local_batch_size, args.num_mini_batches, "`local_batch_size` must be a multiple of `num_mini_batches`" + ) + args.num_total_batches = math.ceil( + args.total_episodes / args.batch_size + ) # we may train for more than `total_episodes` + time_tensor = torch.tensor(int(time.time()), device=accelerator.device) + time_int = broadcast(time_tensor, 0).item() # avoid different timestamps across processes + args.run_name = f"{args.exp_name}__{args.seed}__{time_int}" + self.local_seed = args.seed + accelerator.process_index * 100003 # Prime + if args.num_sample_generations > 0: + self.sample_generations_freq = max(1, args.num_total_batches // args.num_sample_generations) + self.local_dataloader_batch_size = exact_div( + args.local_batch_size, args.rloo_k, "`local_batch_size` must be a multiple of rloo_k" + ) # RLOO logic: needed because RLOO repeats the same prompt args.rloo_k times + + ######### + # setup model, optimizer, and others + ######### + for module in [policy, ref_policy, reward_model]: + if isinstance(module, nn.Module): + disable_dropout_in_model(module) + if args.stop_token and args.stop_token == "eos": + args.stop_token_id = self.processing_class.eos_token_id + self.model = policy + self.create_optimizer_and_scheduler( 
num_training_steps=args.num_total_batches + ) # note that we are calling `self.lr_scheduler.step[]` manually only at the batch level + + ######### + ### trainer specifics + ######### + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) + self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks + self.callback_handler = CallbackHandler( + self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler + ) + self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) + self.control = TrainerControl() + self.state = OnlineTrainerState( + is_local_process_zero=self.is_local_process_zero(), + is_world_process_zero=self.is_world_process_zero(), + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ], + ) + + self.current_flos = 0 + self.hp_search_backend = None + self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None + self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None + # Create distant repo and output directory if needed + self.hub_model_id = None + if self.args.push_to_hub: + self.init_hf_repo() + if self.args.should_save: + os.makedirs(self.args.output_dir, exist_ok=True) + self.backup_model = None + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + ######### + ### setup dataloader + ######### + self.dataloader = DataLoader( + self.train_dataset, + batch_size=self.local_dataloader_batch_size, + shuffle=True, + collate_fn=self.data_collator, + drop_last=True, # needed; otherwise the last batch will be of ragged shape + ) + # sync random states for DataLoader[shuffle=True] before `accelerator.prepare` + # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c + torch.manual_seed(args.seed) + self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader) + torch.manual_seed(self.local_seed) # reset the local seed again + + self.eval_dataloader = DataLoader( + self.eval_dataset, + batch_size=args.per_device_eval_batch_size, + collate_fn=self.data_collator, + drop_last=True, + ) # no need to shuffle eval dataset + self.eval_dataloader = accelerator.prepare(self.eval_dataloader) + + if self.is_deepspeed_enabled: + if isinstance(self.reward_model, nn.Module): + self.reward_model = prepare_deepspeed( + self.reward_model, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + self.ref_policy = prepare_deepspeed( + self.ref_policy, args.per_device_train_batch_size, args.fp16, args.bf16 + ) + self.deepspeed = self.model + else: + self.ref_policy = self.ref_policy.to(self.accelerator.device) + if isinstance(self.reward_model, nn.Module): + self.reward_model = self.reward_model.to(self.accelerator.device) + + def get_train_dataloader(self) -> DataLoader: + return self.dataloader + + def get_eval_dataloader(self) -> DataLoader: + return self.eval_dataloader + + def train(self): + args = self.args + accelerator = self.accelerator + optimizer = self.optimizer + model = self.model + self.model_wrapped = self.model + ref_policy = self.ref_policy + reward_model = self.reward_model + processing_class = self.processing_class + dataloader = self.dataloader + device = accelerator.device + + def repeat_generator(): + while True: + yield from 
dataloader + + iter_dataloader = iter(repeat_generator()) + generation_config = GenerationConfig( + max_new_tokens=args.response_length, + temperature=(args.temperature + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + accelerator.print("===training policy===") + start_time = time.time() + stats_shape = (args.num_ppo_epochs, args.num_mini_batches, args.gradient_accumulation_steps) + approxkl_stats = torch.zeros(stats_shape, device=device) + pg_clipfrac_stats = torch.zeros(stats_shape, device=device) + pg_loss_stats = torch.zeros(stats_shape, device=device) + vf_clipfrac_stats = torch.zeros(stats_shape, device=device) + entropy_stats = torch.zeros(stats_shape, device=device) + ratio_stats = torch.zeros(stats_shape, device=device) + model.train() + + # trainer state initialization + self.state.global_step = 0 + self.state.episode = 0 + self.state.max_steps = (args.num_total_batches * args.num_mini_batches) // 2 + self.state.num_train_epochs = args.total_episodes / self.train_dataset_len + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(self.state.max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(self.state.max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(self.state.max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + for update in range(1, args.num_total_batches + 1): + self.state.episode += 1 * args.batch_size + data = next(iter_dataloader) + with torch.no_grad(): + queries = data["input_ids"].to(device) + queries = queries.repeat(args.rloo_k, 1) + context_length = queries.shape[1] + responses = [] + postprocessed_responses = [] + logprobs = [] + ref_logprobs = [] + scores = [] + sequence_lengths = [] + + # Generate responses and compute logprobs + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + query_responses, logitss = batch_generation( + unwrapped_model, + queries, + args.local_rollout_forward_batch_size, + processing_class.pad_token_id, + generation_config, + ) + + # Process responses in batches + for i in range(0, queries.shape[0], args.local_rollout_forward_batch_size): + query = queries[i : i + args.local_rollout_forward_batch_size] + query_response = query_responses[i : i + args.local_rollout_forward_batch_size] + response = query_response[:, context_length:] + logits = logitss[i : i + args.local_rollout_forward_batch_size] + logprob = selective_log_softmax(logits, response) + del logits + empty_cache() + + ref_output = forward(ref_policy, query_response, processing_class.pad_token_id) + ref_logits = ref_output.logits[:, context_length - 1 : -1] + ref_logits /= args.temperature + 1e-7 + ref_logprob = selective_log_softmax(ref_logits, response) + del ref_output, ref_logits + empty_cache() + + # Response Processing 1. 
truncate response after the first occurrence of `stop_token_id` + postprocessed_response = response + if args.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + args.stop_token_id, processing_class.pad_token_id, response + ) + + # Response Processing 2. run reward model on the truncated responses + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + sequence_length = first_true_indices(postprocessed_response == processing_class.pad_token_id) - 1 + + if isinstance(reward_model, nn.Module): + _, score, _ = get_reward( + reward_model, postprocessed_query_response, processing_class.pad_token_id, context_length + ) + else: + score = torch.tensor( + reward_model( + processing_class.batch_decode(postprocessed_query_response, skip_special_tokens=True) + ), + dtype=torch.float, + ).to(device) + + # Store batch results + responses.append(response) + postprocessed_responses.append(postprocessed_response) + logprobs.append(logprob) + ref_logprobs.append(ref_logprob) + sequence_lengths.append(sequence_length) + scores.append(score) + + # Concatenate all batched results + responses = torch.cat(responses, 0) + postprocessed_responses = torch.cat(postprocessed_responses, 0) + logprobs = torch.cat(logprobs, 0) + ref_logprobs = torch.cat(ref_logprobs, 0) + sequence_lengths = torch.cat(sequence_lengths, 0) + scores = torch.cat(scores, 0) + del (logprob, ref_logprob, score) + empty_cache() + gc.collect() + + # Response Processing 3. filter response. Ensure that the sample contains stop_token_id + # responses not passing that filter will receive a low (fixed) score + # only query humans on responses that pass that filter + contain_eos_token = torch.any(postprocessed_responses == processing_class.eos_token_id, dim=-1) + if args.missing_eos_penalty is not None: + scores[~contain_eos_token] -= self.args.missing_eos_penalty + # accelerator.print(f"{scores=}, {(contain_eos_token.sum() / len(contain_eos_token))=}") + + # be very careful with `padding_mask_p1`; see https://excalidraw.com/#json=LWnzG4w2k5DjF_EOL_xPt,e2w3a-hFJ_gX5vOfeyXGTw + response_idxs = torch.arange(responses.shape[1], device=responses.device).repeat(responses.shape[0], 1) + padding_mask = response_idxs > sequence_lengths.unsqueeze(1) + logprobs = torch.masked_fill(logprobs, padding_mask, INVALID_LOGPROB) + ref_logprobs = torch.masked_fill(ref_logprobs, padding_mask, INVALID_LOGPROB) + + # 4. 
compute rewards + # Compute KL divergence + kl = logprobs - ref_logprobs + + # Normalize rewards + if args.normalize_reward: + scores = (scores - scores.mean()) / (scores.std() + 1e-8) + scores = torch.clamp(scores, -args.reward_clip_range, args.reward_clip_range) + + # Compute total reward with KL penalty + if args.token_level_kl: + # Token-level KL penalty: apply KL penalty per token + kl_reward = -args.kl_coef * kl + + # Get the index of the last non-padded token for each sequence + eos_indices = padding_mask.size(1) - 1 - padding_mask.long().fliplr().argmax(dim=1, keepdim=True) + last_reward = torch.zeros_like(kl) + # Ensure scores has correct shape and type + scores_shaped = scores.reshape(-1, 1).to(kl.dtype) + last_reward.scatter_(dim=1, index=eos_indices, src=scores_shaped) + + # Combine KL reward and last reward + non_score_reward = kl_reward.sum(1) # Keep this for logging + reward = last_reward + kl_reward + rlhf_reward = reward.sum(1) # Sum across sequence length + else: + # Sequence-level KL penalty: sum KL across tokens first + sequence_kl = kl.sum(1) + non_score_reward = -args.kl_coef * sequence_kl + rlhf_reward = non_score_reward + scores + + # vectorized RLOO advantages implementation + rlhf_reward = rlhf_reward.reshape(args.rloo_k, -1) + baseline = (rlhf_reward.sum(0) - rlhf_reward) / (args.rloo_k - 1) + advantages = rlhf_reward - baseline + advantages = advantages.flatten() + + # Normalize advantages + if args.normalize_advantage: + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + empty_cache() + + # Do multiple epochs of PPO training, with a fresh random shuffle in each epoch + for ppo_epoch_idx in range(args.num_ppo_epochs): + b_inds = np.random.permutation(args.local_batch_size) + minibatch_idx = 0 + for mini_batch_start in range(0, args.local_batch_size, args.local_mini_batch_size): + mini_batch_end = mini_batch_start + args.local_mini_batch_size + mini_batch_inds = b_inds[mini_batch_start:mini_batch_end] + gradient_accumulation_idx = 0 + for micro_batch_start in range(0, args.local_mini_batch_size, args.per_device_train_batch_size): + with accelerator.accumulate(model): + micro_batch_end = micro_batch_start + args.per_device_train_batch_size + micro_batch_inds = mini_batch_inds[micro_batch_start:micro_batch_end] + + # Get batch data + mb_advantage = advantages[micro_batch_inds] + mb_responses = responses[micro_batch_inds] + mb_query_responses = query_responses[micro_batch_inds] + mb_logprobs = logprobs[micro_batch_inds] + + # Forward pass + output = forward(model, mb_query_responses, processing_class.pad_token_id) + logits = output.logits[:, context_length - 1 : -1] + logits /= args.temperature + 1e-7 + + # Compute new logprobs + new_logprobs = selective_log_softmax(logits, mb_responses) + new_logprobs = torch.masked_fill( + new_logprobs, padding_mask[micro_batch_inds], INVALID_LOGPROB + ) + + # Compute probability ratios + new_ratio = (new_logprobs - mb_logprobs).exp() + new_logprobs = new_logprobs.sum(1) + mb_logprobs = mb_logprobs.sum(1) + logprobs_diff = new_logprobs - mb_logprobs + ratio = torch.exp(logprobs_diff) + + # PPO clipped loss + pg_losses = -mb_advantage * ratio + pg_losses2 = -mb_advantage * torch.clamp(ratio, 1.0 - args.cliprange, 1.0 + args.cliprange) + pg_loss_max = torch.max(pg_losses, pg_losses2) + pg_loss = pg_loss_max.mean() + + # Final loss + loss = pg_loss + + # Optimization step + accelerator.backward(loss) + optimizer.step() + optimizer.zero_grad() + + with torch.no_grad(): + pg_clipfrac = (pg_losses2 > 
pg_losses).float().mean() + prob_dist = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(prob_dist * logits, dim=-1) + approxkl = 0.5 * (logprobs_diff**2).mean() + approxkl_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = approxkl + pg_clipfrac_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = ( + pg_clipfrac + ) + pg_loss_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = pg_loss + entropy_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = entropy.mean() + ratio_stats[ppo_epoch_idx, minibatch_idx, gradient_accumulation_idx] = new_ratio.mean() + gradient_accumulation_idx += 1 + minibatch_idx += 1 + + # del everything and empty cache + # fmt: off + del ( + output, logits, new_logprobs, logprobs_diff, ratio, pg_losses, + pg_losses2, pg_loss, loss, pg_clipfrac, prob_dist, entropy, approxkl, + mb_advantage, mb_responses, mb_query_responses, mb_logprobs, + ) + # fmt: on + empty_cache() + + # Compute metrics + with torch.no_grad(): + mean_kl = kl.sum(1).mean() + mean_entropy = (-logprobs).sum(1).mean() + mean_non_score_reward = non_score_reward.mean() + eps = int(self.state.episode / (time.time() - start_time)) + metrics = {} + metrics["eps"] = eps + metrics["objective/kl"] = self.accelerator.gather_for_metrics(mean_kl).mean().item() + metrics["objective/entropy"] = self.accelerator.gather_for_metrics(mean_entropy).mean().item() + metrics["objective/non_score_reward"] = ( + self.accelerator.gather_for_metrics(mean_non_score_reward).mean().item() + ) + metrics["objective/rlhf_reward"] = self.accelerator.gather_for_metrics(rlhf_reward).mean().item() + metrics["objective/scores"] = self.accelerator.gather_for_metrics(scores.mean()).mean().item() + metrics["policy/approxkl_avg"] = self.accelerator.gather_for_metrics(approxkl_stats).mean().item() + metrics["policy/clipfrac_avg"] = self.accelerator.gather_for_metrics(pg_clipfrac_stats).mean().item() + metrics["loss/policy_avg"] = self.accelerator.gather_for_metrics(pg_loss_stats).mean().item() + metrics["val/clipfrac_avg"] = self.accelerator.gather_for_metrics(vf_clipfrac_stats).mean().item() + metrics["policy/entropy_avg"] = self.accelerator.gather_for_metrics(entropy_stats).mean().item() + metrics["val/ratio"] = self.accelerator.gather_for_metrics(ratio_stats).mean().item() + metrics["val/ratio_var"] = self.accelerator.gather_for_metrics(ratio_stats).var().item() + metrics["val/num_eos_tokens"] = (responses == processing_class.eos_token_id).sum().item() + metrics["lr"] = self.lr_scheduler.get_last_lr()[0] + metrics["episode"] = self.state.episode + self.state.epoch = self.state.episode / (args.rloo_k * self.train_dataset_len) # used by self.log + self.log(metrics) + del kl, mean_kl, mean_entropy, scores + + self.lr_scheduler.step() + self.state.global_step += 1 + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + empty_cache() + gc.collect() + + if args.num_sample_generations > 0 and (update - 1) % self.sample_generations_freq == 0: + self.generate_completions(sampling=True) + + # HF trainer specifics + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + if self.control.should_save: + self._save_checkpoint(model, trial=None, metrics=None) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + 
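+    # Illustrative note on the leave-one-out baseline used in `train()` above (a
+    # numeric sketch only, not code the trainer executes): each prompt is repeated
+    # `rloo_k` times, and every completion is baselined against the mean
+    # `rlhf_reward` (score plus KL penalty) of the other k - 1 completions for the
+    # same prompt,
+    #     advantage_i = r_i - (sum_j r_j - r_i) / (k - 1)
+    # e.g. with rloo_k = 3 and rewards [1.0, 2.0, 3.0] the baselines are
+    # [2.5, 2.0, 1.5] and the advantages are [-1.5, 0.0, 1.5].
+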
+ def generate_completions(self, sampling: bool = False): + args = self.args + processing_class = self.processing_class + generation_config = GenerationConfig( + max_new_tokens=self.args.response_length, + temperature=(0.01 + 1e-7), + top_k=0.0, + top_p=1.0, + do_sample=True, + ) + + table = defaultdict(list) + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + for batch in self.eval_dataloader: + query = batch["input_ids"] + with torch.no_grad(): + context_length = query.shape[1] + query_response, _ = batch_generation( + unwrapped_model, + query, + query.shape[0], + processing_class.pad_token_id, + generation_config, + ) + response = query_response[:, context_length:] + postprocessed_response = response + if args.stop_token_id is not None: # handle the edge case when stop_token_id exists but is 0 + postprocessed_response = truncate_response( + args.stop_token_id, processing_class.pad_token_id, response + ) + table["query"].extend( + gather_object(processing_class.batch_decode(query, skip_special_tokens=True)) + ) + table["model response"].extend( + gather_object(processing_class.batch_decode(postprocessed_response)) + ) + + postprocessed_query_response = torch.cat((query, postprocessed_response), 1) + + if isinstance(self.reward_model, nn.Module): + _, score, _ = get_reward( + self.reward_model, + postprocessed_query_response, + processing_class.pad_token_id, + context_length, + ) + else: + score = torch.tensor( + self.reward_model( + processing_class.batch_decode(postprocessed_query_response, skip_special_tokens=True) + ), + dtype=torch.float, + ).to(postprocessed_query_response.device) + table["score"].extend(self.accelerator.gather_for_metrics(score).float().cpu().numpy()) + + if sampling: + break + df = pd.DataFrame(table) + + if self.accelerator.is_main_process: + if is_rich_available(): + print_rich_table(df.iloc[0 : 0 + 5]) + if "wandb" in args.report_to: + import wandb + + if wandb.run is not None: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + if "comet_ml" in args.report_to: + log_table_to_comet_experiment( + name="completions.csv", + table=df, + ) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @inproceedings{ahmadian2024back, + title = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}}, + author = {Arash Ahmadian and Chris Cremer and Matthias Gall{\'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {\"{U}}st{\"{u}}n and Sara Hooker}, + year = 2024, + booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024}, + publisher = {Association for Computational Linguistics}, + pages = {12248--12267}, + editor = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar}, + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="RLOO", + trainer_citation=citation, + paper_title="Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs", + paper_id="2402.14740", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothRLOOTrainer(_UnslothRLOOTrainer): + """ + + """ + def __init__( + self, + config, + processing_class, + policy, + ref_policy, + reward_model, + train_dataset, + data_collator = None, + eval_dataset = None, + callbacks = None, + **kwargs + ): + if args is None: args = UnslothRLOOConfig() + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = 
TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('rloo_trainer', other_metrics) + + super().__init__( + config = config, + processing_class = processing_class, + policy = policy, + ref_policy = ref_policy, + reward_model = reward_model, + train_dataset = train_dataset, + data_collator = data_collator, + eval_dataset = eval_dataset, + callbacks = callbacks,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothRewardTrainer.py b/unsloth_compiled_cache/UnslothRewardTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..ce6b18e3ff0c16da66e2e6f05ad723b368a66830 --- /dev/null +++ b/unsloth_compiled_cache/UnslothRewardTrainer.py @@ -0,0 +1,844 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.reward_trainer import (Any, BaseImageProcessor, Callable, DataCollator, Dataset, EvalPrediction, FeatureExtractionMixin, FrozenInstanceError, Optional, PartialState, Path, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardConfig, RewardDataCollatorWithPadding, RewardTrainer, Trainer, TrainerCallback, Union, _tokenize, compute_accuracy, decode_and_strip_padding, defaultdict, disable_dropout_in_model, gather_object, generate_model_card, get_comet_experiment_url, inspect, is_peft_available, is_rich_available, is_wandb_available, log_table_to_comet_experiment, maybe_apply_chat_template, nested_detach, nn, os, pd, prepare_model_for_kbit_training, print_rich_table, replace, torch, wandb, warnings, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, torch) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic 
= True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothRewardConfig(RewardConfig): + """ + + Configuration class for the [`RewardTrainer`]. + + This class includes only the parameters that are specific to Reward training. For a full list of training + arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this + class may differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the sequences (prompt + completion) in the batch, filters out entries that exceed the + limit. This argument is required if you want to use the default data collator. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + center_rewards_coefficient (`float`, *optional*, defaults to `None`): + Coefficient to incentivize the reward model to output mean-zero rewards (proposed by + https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`. + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if the + dataset is pretokenized. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = False, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + max_length = 1024, 
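+        # The arguments from `max_length` above down to `center_rewards_coefficient`
+        # below are the RewardConfig-specific options described in the class
+        # docstring; the long list of earlier arguments mirrors
+        # `transformers.TrainingArguments` (with some Unsloth-adjusted defaults),
+        # and the trailing `vllm_sampling_params` / `unsloth_num_chunks` are
+        # Unsloth-only additions.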
+ disable_dropout = True, + dataset_num_proc = None, + center_rewards_coefficient = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = 
fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + max_length = max_length, + disable_dropout = disable_dropout, + dataset_num_proc = dataset_num_proc, + center_rewards_coefficient = center_rewards_coefficient,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothRewardTrainer(Trainer): + _tag_names = ["trl", "reward-trainer"] + + def __init__( + self, + model: Optional[Union[PreTrainedModel, nn.Module]] = None, + args: Optional[RewardConfig] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( + None, + None, + ), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional[dict] = None, + ): + """ + Initialize RewardTrainer. 
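+
+        Example (an illustrative sketch only; `"my-base-model"` and
+        `preference_dataset` are placeholders, and any sequence-classification
+        reward model with `num_labels=1` plus a dataset with "chosen"/"rejected"
+        columns would do):
+
+        ```python
+        from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+        model = AutoModelForSequenceClassification.from_pretrained("my-base-model", num_labels=1)
+        tokenizer = AutoTokenizer.from_pretrained("my-base-model")
+        trainer = RewardTrainer(  # the Unsloth wrapper shares this signature
+            model=model,
+            args=RewardConfig(output_dir="reward-model", max_length=1024),
+            train_dataset=preference_dataset,
+            processing_class=tokenizer,
+        )
+        trainer.train()
+        ```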
+ + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForSequenceClassification`. + args (`RewardConfig`): + The arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`RewardDataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of + the sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the + inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted + training or reuse the fine-tuned model. + model_init (`Callable[[], transformers.PreTrainedModel]`): + The model initializer to use for training. If None is specified, the default model initializer will be + used. + compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`): + The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) + will be used. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + peft_config (`dict`, defaults to `None`): + The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped + in a PEFT model. + """ + if not is_peft_available() and peft_config is not None: + raise ValueError( + "PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models" + ) + elif is_peft_available() and peft_config is not None: + if not isinstance(model, PeftModel): + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False): + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + + prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing} + + if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None: + warnings.warn( + "You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. 
" + "please update to the latest version of peft to use `gradient_checkpointing_kwargs`.", + UserWarning, + ) + elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + + model = model + + # Disable dropout in the model + if args.disable_dropout: + disable_dropout_in_model(model) + + if compute_metrics is None: + compute_metrics = compute_accuracy + + if data_collator is None: + if processing_class is None: + raise ValueError( + "A processing_class must be specified when using the default RewardDataCollatorWithPadding" + ) + + max_length = args.max_length + + data_collator = RewardDataCollatorWithPadding(processing_class) + + if args.remove_unused_columns: + try: # for bc before https://github.com/huggingface/transformers/pull/25435 + args.remove_unused_columns = False + except FrozenInstanceError: + args = replace(args, remove_unused_columns=False) + # warn users + warnings.warn( + "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig" + " we have set it for you, but you should do it yourself in the future.", + UserWarning, + ) + + self.use_reward_data_collator = True + else: + self.use_reward_data_collator = False + + # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the + # input tensor associated with the key "input_ids". However, in Reward, the sampled data does not include the + # "input_ids" key. Instead, the available keys are "input_ids_chosen" and "input_ids_rejected". As a result, + # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point + # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's + # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been + # issued. + model.warnings_issued["estimate_tokens"] = True + + if "input_ids_chosen" not in train_dataset.column_names: + with PartialState().main_process_first(): + fn_kwargs = {"tokenizer": processing_class} + train_dataset = train_dataset.map(maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}) + train_dataset = train_dataset.map( + _tokenize, + batched=True, + fn_kwargs=fn_kwargs, + num_proc=args.dataset_num_proc, + ) + # This filter is important because otherwise you get samples that exceed the model's context length and + # get truncated => noisy signal the chosen/rejected label gets lost. The downside is that the + # user might get surprised if N samples are missing from training. + train_dataset = train_dataset.filter( + lambda x: len(x["input_ids_chosen"]) <= max_length and len(x["input_ids_rejected"]) <= max_length, + num_proc=args.dataset_num_proc, + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class} + ) + eval_dataset = eval_dataset.map( + _tokenize, + fn_kwargs=fn_kwargs, + batched=True, + num_proc=args.dataset_num_proc, + ) + # This filter is important because otherwise you get samples that exceed the model's context length and + # get truncated => noisy signal the chosen/rejected label gets lost. The downside is that the + # user might get surprised if N samples are missing from training. 
+ eval_dataset = eval_dataset.filter( + lambda x: len(x["input_ids_chosen"]) <= max_length + and len(x["input_ids_rejected"]) <= max_length, + num_proc=args.dataset_num_proc, + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + def compute_loss( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + return_outputs=False, + num_items_in_batch=None, + ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: + rewards_chosen = model( + input_ids=inputs["input_ids_chosen"], + attention_mask=inputs["attention_mask_chosen"], + return_dict=True, + )["logits"] + rewards_rejected = model( + input_ids=inputs["input_ids_rejected"], + attention_mask=inputs["attention_mask_rejected"], + return_dict=True, + )["logits"] + # calculate loss, optionally modulate with margin + if "margin" in inputs: + loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean() + else: + loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean() + + if self.args.center_rewards_coefficient is not None: + loss += self.args.center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2) + + if return_outputs: + return loss, { + "rewards_chosen": rewards_chosen, + "rewards_rejected": rewards_rejected, + } + return loss + + def prediction_step( + self, + model: Union[PreTrainedModel, nn.Module], + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + with torch.no_grad(): + loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True) + + if prediction_loss_only: + return (loss, None, None) + + loss = loss.detach() + logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys) + logits = nested_detach(logits) + # Stack accepted against rejected, mean over logits + # and softmax to get preferences between accepted and rejected to sum to 1 + logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T + + labels = torch.zeros(logits.shape[0]) + labels = self._prepare_inputs(labels) + + return loss, logits, labels + + def evaluate(self, *args, **kwargs): + num_print_samples = kwargs.pop("num_print_samples", 4) + self.visualize_samples(num_print_samples) + return super().evaluate(*args, **kwargs) + + def visualize_samples(self, num_print_samples: int): + """ + Visualize the reward model logits prediction + + Args: + num_print_samples (`int`, defaults to `4`): + The number of samples to print. Set to `-1` to print all samples. 
+ """ + eval_dataloader = self.get_eval_dataloader() + table = defaultdict(list) + for _, inputs in enumerate(eval_dataloader): + _, logits, _ = self.prediction_step(self.model, inputs, prediction_loss_only=False) + chosen_text = decode_and_strip_padding(inputs["input_ids_chosen"], self.processing_class) + rejected_text = decode_and_strip_padding(inputs["input_ids_rejected"], self.processing_class) + table["chosen_text"].extend(gather_object(chosen_text)) + table["rejected_text"].extend(gather_object(rejected_text)) + table["logits"].extend( + gather_object([[round(inner_item, 4) for inner_item in item] for item in logits.tolist()]) + ) + if num_print_samples >= 0 and len(table["chosen_text"]) >= num_print_samples: + break + df = pd.DataFrame(table) + if self.accelerator.process_index == 0: + if is_rich_available(): + print_rich_table(df[:num_print_samples]) + if "wandb" in self.args.report_to: + import wandb + + if wandb.run is not None: + wandb.log({"completions": wandb.Table(dataframe=df)}) + + if "comet_ml" in self.args.report_to: + log_table_to_comet_experiment( + name="completions.csv", + table=df, + ) + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="Reward", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothRewardTrainer(_UnslothRewardTrainer): + """ + + """ + def __init__( + self, + model = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + model_init = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + peft_config = None, + **kwargs + ): + if args is None: args = UnslothRewardConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in 
train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('reward_trainer', other_metrics) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + model_init = model_init, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothSFTTrainer.py b/unsloth_compiled_cache/UnslothSFTTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..5079a30f4fadd2ad06e35a22faa846d08d73b2ca --- /dev/null +++ b/unsloth_compiled_cache/UnslothSFTTrainer.py @@ -0,0 +1,1206 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.sft_trainer import (Any, AutoModelForCausalLM, AutoTokenizer, BaseImageProcessor, Callable, ConstantLengthDataset, DataCollator, DataCollatorForLanguageModeling, Dataset, EvalPrediction, FeatureExtractionMixin, IterableDataset, Optional, Path, PeftConfig, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SFTConfig, SFTTrainer, Trainer, TrainerCallback, TrainingArguments, Union, clone_chat_template, contextlib, dataclass, dataclasses, defaultdict, generate_model_card, get_act_offloading_ctx_manager, get_comet_experiment_url, get_peft_model, is_conversational, is_peft_available, is_wandb_available, nn, os, pad, peft, peft_module_casting_to_bf16, prepare_model_for_kbit_training, torch, version, wandb, warnings, Callable, ConstantLengthDataset, DataCollator, DataCollatorForLanguageModeling, Dataset, IterableDataset, Optional, Union, os, pad, Optional, PeftModel, PreTrainedModel, Trainer, is_peft_available, os, peft, torch, os) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import 
DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothSFTConfig(SFTConfig): + """ + + Configuration class for the [`SFTTrainer`]. + + This class includes only the parameters that are specific to SFT training. For a full list of training arguments, + please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may + differ from those in [`~transformers.TrainingArguments`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`SFTTrainer`] is provided as a string. + chat_template_path (`str` or `None`, *optional*, defaults to `None`): + If specified, sets the model's chat template. This can either be the path to a tokenizer (local directory + or Hugging Face Hub model) or a direct path to a Jinja template file. When using a Jinja file, you must + ensure that any special tokens referenced in the template are added to the tokenizer and that the model's + embedding layer is resized accordingly. + + > Parameters that control the data preprocessing + + dataset_text_field (`str`, *optional*, defaults to `"text"`): + Name of the column that contains text data in the dataset. + dataset_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Dictionary of optional keyword arguments for the dataset preparation. The only supported key is + `skip_prepare_dataset`. + dataset_num_proc (`int` or `None`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + eos_token (`str` or `None`, *optional*, defaults to `None`): + Token used to indicate the end of a turn or sequence. If `None`, it defaults to + `processing_class.eos_token`. + pad_token (`int` or `None`, *optional*, defaults to `None`): + Token used for padding. If `None`, it defaults to `processing_class.pad_token`, or if that is also `None`, + it falls back to `processing_class.eos_token`. + max_length (`int` or `None`, *optional*, defaults to `1024`): + Maximum length of the tokenized sequence. Sequences longer than `max_length` are truncated from the right. + If `None`, no truncation is applied. When packing is enabled, this value sets the sequence length. + packing (`bool`, *optional*, defaults to `False`): + Whether to group multiple sequences into fixed-length blocks to improve computational efficiency and reduce + padding. 
Uses `max_length` to define sequence length. + packing_strategy (`str`, *optional*, defaults to `"ffd"`): + Strategy for packing sequences. Can be either `"ffd"` (first-fit decreasing, default), or `"wrapped"`. + padding_free (`bool`, *optional*, defaults to `False`): + Whether to perform forward passes without padding by flattening all sequences in the batch into a single + continuous sequence. This reduces memory usage by eliminating padding overhead. Currently, this is only + supported with the `flash_attention_2` attention implementation, which can efficiently handle the flattened + batch structure. When packing is enabled with strategy `"ffd"`, padding-free is enabled, regardless of the + value of this parameter. + pad_to_multiple_of (`int` or `None`, *optional*, defaults to `None`): + If set, the sequences will be padded to a multiple of this value. + eval_packing (`bool` or `None`, *optional*, defaults to `None`): + Whether to pack the eval dataset. If `None`, uses the same value as `packing`. + + > Parameters that control the training + + completion_only_loss (`bool` or `None`, *optional*, defaults to `None`): + Whether to compute loss only on the completion part of the sequence. If set to `True`, loss is computed + only on the completion, which is supported only for [prompt-completion](#prompt-completion) datasets. If + `False`, loss is computed on the entire sequence. If `None` (default), the behavior depends on the dataset: + loss is computed on the completion for [prompt-completion](#prompt-completion) datasets, and on the full + sequence for [language modeling](#language-modeling) datasets. + assistant_only_loss (`bool`, *optional*, defaults to `False`): + Whether to compute loss only on the assistant part of the sequence. If set to `True`, loss is computed + only on the assistant responses, which is supported only for [conversational](#conversational) datasets. If `False`, + loss is computed on the entire sequence. + activation_offloading (`bool`, *optional*, defaults to `False`): + Whether to offload the activations to the CPU. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = True, + model_init_kwargs = 
None, + chat_template_path = None, + dataset_text_field = 'text', + dataset_kwargs = None, + dataset_num_proc = None, + eos_token = None, + pad_token = None, + max_length = 1024, + packing = False, + packing_strategy = 'ffd', + padding_free = False, + pad_to_multiple_of = None, + eval_packing = None, + completion_only_loss = None, + assistant_only_loss = False, + activation_offloading = False, + max_seq_length = None, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + 
remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + model_init_kwargs = model_init_kwargs, + chat_template_path = chat_template_path, + dataset_text_field = dataset_text_field, + dataset_kwargs = dataset_kwargs, + dataset_num_proc = dataset_num_proc, + eos_token = eos_token, + pad_token = pad_token, + max_length = max_length, + packing = packing, + packing_strategy = packing_strategy, + padding_free = padding_free, + pad_to_multiple_of = pad_to_multiple_of, + eval_packing = eval_packing, + completion_only_loss = completion_only_loss, + assistant_only_loss = assistant_only_loss, + activation_offloading = activation_offloading, + max_seq_length = max_seq_length,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothSFTTrainer(Trainer): + """""" + + _tag_names = ["trl", "sft"] + + def __init__( + self, + model: Union[str, nn.Module, PreTrainedModel], + args: Optional[Union[SFTConfig, TrainingArguments]] = None, + data_collator: 
Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + compute_loss_func: Optional[Callable] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + formatting_func: Optional[Callable[[dict], str]] = None, + ): + # Args + model_id = model if isinstance(model, str) else model.config._name_or_path + if args is None: + model_name = model_id.split("/")[-1] + args = SFTConfig(f"{model_name}-SFT") + elif isinstance(args, TrainingArguments) and not isinstance(args, SFTConfig): + dict_args = args.to_dict() + dict_args["hub_token"] = args.hub_token # to_dict hides the hub_token + dict_args.pop("push_to_hub_token") + args = SFTConfig(**dict_args) + + # Handle the tokenizer + if processing_class is None: + processing_class = AutoTokenizer.from_pretrained(model_id) + + if args.eos_token is not None: + eos_token = args.eos_token + eos_token_id = processing_class.convert_tokens_to_ids(eos_token) + if eos_token_id is None: + raise ValueError( + f"The specified `eos_token` ('{eos_token}') is not found in the vocabulary of the given " + f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `eos_token` exists " + "in the vocabulary before using it as an EOS token." + ) + processing_class.eos_token_id = eos_token_id + + # Model + if args.model_init_kwargs is not None and not isinstance(model, str): + warnings.warn( + "You passed model_init_kwargs to the `SFTConfig`, but your model is already instantiated. " + "The `model_init_kwargs` will be ignored." + ) + if isinstance(model, str): + model = self._create_model_from_path(model, args) + + if args.chat_template_path is not None: + if os.path.isfile(args.chat_template_path) and args.chat_template_path.endswith((".jinja", ".j2")): + with open(args.chat_template_path, encoding="utf-8") as chat_template_file: + processing_class.chat_template = chat_template_file.read() + else: + model, processing_class = clone_chat_template(model, processing_class, args.chat_template_path) + + # PEFT configuration and model wrapping + if False: + model = self._prepare_peft_model(model, peft_config, args) + + # Data collator + # FFD packing requires padding-free mode; otherwise, the collator outputs padded attention masks, causing + # FlashAttention to ignore position_ids and recompute them incorrectly from the padded attention mask. + self.padding_free = args.padding_free or (args.packing and args.packing_strategy == "ffd") + if self.padding_free: + if data_collator is not None: + raise ValueError("Passing a custom data collator is not supported when using padding-free.") + if args.packing and args.packing_strategy == "wrapped": + warnings.warn( + "You are passing `padding_free=True` with the 'wrapped' packing strategy, which is not " + "recommended. Please refer to the documentation to understand why this is not recommended." 
+ ) + if model.config._attn_implementation != "flash_attention_2": + warnings.warn( + "Padding-free training is enabled, but the attention implementation is not set to " + "'flash_attention_2'. Padding-free training flattens batches into a single sequence, and " + "'flash_attention_2' is the only known attention mechanism that reliably supports this. Using " + "other implementations may lead to unexpected behavior. To ensure compatibility, set " + "`attn_implementation='flash_attention_2'` in the model configuration, or verify that your " + "attention mechanism can handle flattened sequences." + ) + if args.per_device_train_batch_size == 1 and not args.packing: + warnings.warn( + "You are using a per_device_train_batch_size of 1 with padding-free training. Using a batch size " + "of 1 anihilate the benefits of padding-free training. Please consider increasing the batch size " + "to at least 2." + ) + + if args.completion_only_loss is None: + first_example = next(iter(train_dataset)) + self.completion_only_loss = "prompt" in first_example + else: + self.completion_only_loss = args.completion_only_loss + + if data_collator is None: + # Get the pad token: if not provided, use the one from the processing class or the eos token + # if the processing class does not have a pad token. + pad_token = args.pad_token or processing_class.pad_token or processing_class.eos_token + pad_token_id = processing_class.convert_tokens_to_ids(pad_token) + if pad_token_id is None: + raise ValueError( + f"The specified `pad_token` ('{pad_token}') is not found in the vocabulary of the given " + f"`processing_class` ({processing_class.__class__.__name__}). Ensure that the `pad_token` exists " + "in the vocabulary before using it as a padding token." + ) + data_collator = DataCollatorForLanguageModeling( + pad_token_id=pad_token_id, + completion_only_loss=self.completion_only_loss, + padding_free=self.padding_free, + # Using position_ids without flash_attn hurts the training + return_position_ids=model.config._attn_implementation == "flash_attention_2", + pad_to_multiple_of=args.pad_to_multiple_of, + ) + + if ( + args.packing + and args.packing_strategy == "ffd" + and model.config._attn_implementation != "flash_attention_2" + ): + warnings.warn( + "You are using packing, but the attention implementation is not set to 'flash_attention_2'. Packing " + "flattens batches into a single sequence, and 'flash_attention_2' is the only known attention " + "mechanism that reliably supports this. Using other implementations may lead to cross-contamination " + "between batches. To avoid this, either disable packing by setting `packing=False`, or set " + "`attn_implementation='flash_attention_2'` in the model configuration." + ) + if args.assistant_only_loss and not is_conversational(train_dataset[0]): + raise ValueError( + "You set `assistant_only_loss=True`, but the dataset is not conversational. This option is only " + "supported for conversational datasets." + ) + + # Dataset + preprocess_dataset = args.dataset_kwargs is None or not args.dataset_kwargs.get("skip_prepare_dataset", False) + if preprocess_dataset: + if self.completion_only_loss and formatting_func: + raise ValueError( + "A formatting function was provided while `completion_only_loss=True`, which is incompatible. " + "Using a formatter converts the dataset to a language modeling type, conflicting with " + "completion-only loss. To resolve this, apply your formatting function before passing the " + "dataset, or disable `completion_only_loss` in `SFTConfig`." 
+ ) + train_dataset = self._prepare_dataset( + train_dataset, processing_class, args, args.packing, formatting_func, "train" + ) + if eval_dataset is not None: + packing = args.packing if args.eval_packing is None else args.eval_packing + if isinstance(eval_dataset, dict): + eval_dataset = { + key: self._prepare_dataset(dataset, processing_class, args, packing, formatting_func, key) + for key, dataset in eval_dataset.items() + } + else: + eval_dataset = self._prepare_dataset( + eval_dataset, processing_class, args, packing, formatting_func, "eval" + ) + + # Initialize the metrics + self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} + self._total_train_tokens = 0 + + # Initialize the Trainer. Parent class will handle: + # - DeepSpeed configuration [through create_accelerator_and_postprocess] + # - FSDP setup + # - Distributed training setup + # - Optimizer and scheduler creation + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + compute_loss_func=compute_loss_func, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + optimizer_cls_and_kwargs=optimizer_cls_and_kwargs, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Initialize activation offloading context + if self.args.activation_offloading: + self.maybe_activation_offload_context = get_act_offloading_ctx_manager(model=self.model) + else: + self.maybe_activation_offload_context = contextlib.nullcontext() + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + def _create_model_from_path(self, model_path: str, args: SFTConfig) -> PreTrainedModel: + """Creates a model from a path or model identifier.""" + model_init_kwargs = args.model_init_kwargs or {} + # Handle torch dtype + torch_dtype = model_init_kwargs.get("torch_dtype") + if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None: + pass # torch_dtype is already a torch.dtype or "auto" or None + elif isinstance(torch_dtype, str): # it's a str, but not "auto" + torch_dtype = getattr(torch, torch_dtype) + model_init_kwargs["torch_dtype"] = torch_dtype + else: + raise ValueError( + "Invalid `torch_dtype` passed to `SFTConfig`. Expected either 'auto' or a string representing " + f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}." + ) + # Disable caching if gradient checkpointing is enabled (not supported) + # if args.gradient_checkpointing: + # model_init_kwargs["use_cache"] = False + + # Create model + model = AutoModelForCausalLM.from_pretrained(model_path, **model_init_kwargs) + return model + + def _prepare_peft_model(self, model: PreTrainedModel, peft_config: Any, args: SFTConfig) -> PreTrainedModel: + """Prepares a model for PEFT training.""" + if not is_peft_available(): + raise ImportError("To use PeftModel, you need to install the `peft` library.") + + if not isinstance(peft_config, PeftConfig): + raise ValueError( + f"Expected PeftConfig object but got {type(peft_config)}. If you want to use the PeftModel, you need " + "to pass a PeftConfig object to the SFTTrainer." 
+ ) + + if isinstance(model, PeftModel): + return model + + # Handle quantized models (QLoRA) + is_qlora = getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False) + + is_sharded_qlora = False + if getattr(model, "is_loaded_in_4bit", False): + # Check if model is sharded (FSDP/DS-Zero3) + for _, param in model.named_parameters(): + if param.__class__.__name__ == "Params4bit": + is_sharded_qlora = param.data.device.type in {"cpu", "meta"} + break + + # Prepare model for kbit training if needed + if is_qlora and not is_sharded_qlora: + model = self._prepare_model_for_kbit_training(model, args) + # Disable gradient checkpointing as it's handled by prepare_model_for_kbit_training + args = dataclasses.replace(args, gradient_checkpointing=False) + elif args.gradient_checkpointing: + model = self._enable_gradient_checkpointing(model, args) + + # Create PEFT model + if ( + version.parse(peft.__version__) >= version.parse("0.12") # autocast_adapter_dtype introduced in 0.12 + and getattr(model, "is_loaded_in_4bit", False) + and is_sharded_qlora + ): + model = get_peft_model(model, peft_config, autocast_adapter_dtype=False) + else: + model = get_peft_model(model, peft_config) + + # Handle bf16 casting for 4-bit models + if args.bf16 and getattr(model, "is_loaded_in_4bit", False) and not is_sharded_qlora: + peft_module_casting_to_bf16(model) + + return model + + def _prepare_model_for_kbit_training(self, model: PreTrainedModel, args: SFTConfig) -> PreTrainedModel: + """Prepares a quantized model for kbit training.""" + prepare_model_kwargs = { + "use_gradient_checkpointing": args.gradient_checkpointing, + "gradient_checkpointing_kwargs": args.gradient_checkpointing_kwargs or {}, + } + + return prepare_model_for_kbit_training(model, **prepare_model_kwargs) + + def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: SFTConfig) -> PreTrainedModel: + """Enables gradient checkpointing for the model.""" + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {} + use_reentrant = ( + "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"] + ) + + if use_reentrant: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + return model + + def _prepare_dataset( + self, + dataset: Union[Dataset, IterableDataset], + processing_class, + args, + packing: bool, + formatting_func: Optional[Callable[[dict], str]], + dataset_name: str, + ) -> Union[Dataset, IterableDataset]: + # All Unsloth Zoo code licensed under LGPLv3 + if isinstance(dataset, ConstantLengthDataset): return dataset + + map_kwargs = {} + use_desc = isinstance(dataset, Dataset) + is_vlm = hasattr(processing_class, "tokenizer") + tokenizer = processing_class + if is_vlm: tokenizer = processing_class.tokenizer + + # Get max length + max_seq_length = getattr(args, "max_length", 0) + if max_seq_length == 0: max_seq_length = getattr(args, "max_seq_length", 0) + if max_seq_length == 0: max_seq_length = getattr(self, "max_seq_length", 0) + if max_seq_length == 0: max_seq_length = getattr(self, "max_seq", 0) + if max_seq_length == 0: raise RuntimeError("Unsloth: max_seq_length is 0! 
Please specify one!") + dataset_text_field = getattr(args, "dataset_text_field", "text") + do_truncation = max_seq_length != 0 + do_formatting_func = False + do_tokenize = True + + # Get correct column names + column_names = set(next(iter(dataset)).keys()) + used_column_names = ["input_ids"] + if "attention_mask" in column_names: + used_column_names.append("attention_mask") + + # Check if already tokenized so skip + from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling + if "labels" in column_names: + # Most likely forgot data collator! + if is_vlm and not hasattr(tokenizer, "pad"): + # Check if processing_class has a .pad, if not, use tokenizer.tokenizer + raise RuntimeError(f"Unsloth: {processing_class.__class__} does not have .pad!") + self.data_collator = DataCollatorForSeq2Seq(tokenizer) + used_column_names.append("labels") + do_tokenize = False + elif "input_ids" in column_names: + # Skip dataset prep, and set data collator + if is_vlm and not hasattr(tokenizer, "pad"): + # Check if processing_class has a .pad, if not, use tokenizer.tokenizer + raise RuntimeError(f"Unsloth: {processing_class.__class__} does not have .pad!") + self.data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False) + do_tokenize = False + elif dataset_text_field not in column_names: + do_formatting_func = True + if formatting_func is None: + raise RuntimeError("Unsloth: You must specify a `formatting_func`") + pass + + if do_tokenize: + # Check double BOS tokens + if do_formatting_func: + test_text = formatting_func(next(iter(dataset))) + if not isinstance(test_text, list): + raise ValueError( + "Unsloth: The `formatting_func` should return a list of processed strings." + ) + test_text = test_text[0] + else: + test_text = next(iter(dataset))[dataset_text_field][0] + + # Get chat template + chat_template = getattr(processing_class, 'chat_template', '') + if chat_template == '' and is_vlm: + chat_template = getattr(tokenizer, 'chat_template', '') + if chat_template is None: + chat_template = '' + + # Get bos_token + add_special_tokens = True + bos_token_1 = getattr(processing_class, 'bos_token', None) + bos_token_2 = getattr(tokenizer, 'bos_token', None) + bos_token = bos_token_1 or bos_token_2 + + if bos_token is not None: + if test_text.startswith(bos_token) or bos_token in chat_template: + add_special_tokens = False + print("Unsloth: We found double BOS tokens - we shall remove one automatically.") + pass + + # Create tokenize function + def _tokenize(example): + return tokenizer( + example[dataset_text_field] if not do_formatting_func else formatting_func(example), + truncation = do_truncation, + max_length = max_seq_length, + return_token_type_ids = False, + add_special_tokens = add_special_tokens, + ) + pass + + if not isinstance(dataset, IterableDataset): + map_kwargs["num_proc"] = getattr(args, "dataset_num_proc", 2) + else: + map_kwargs["batch_size"] = dataset._ex_iterable.batch_size + + if use_desc: map_kwargs["desc"] = f'Unsloth: Tokenizing ["{dataset_text_field}"]' + dataset = dataset.map(_tokenize, batched = True, **map_kwargs) + + # If VLM, switch data collator since .pad is needed! 
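+ # Descriptive note on the branch below: Hugging Face vision-language processors
+ # (ProcessorMixin subclasses) typically expose `.pad` only on their inner
+ # `.tokenizer`, not on the processor itself, so when `is_vlm` is True and
+ # `processing_class` lacks `.pad`, the collator is rebuilt from `tokenizer`
+ # (set to `processing_class.tokenizer` earlier in this method).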
+ if is_vlm and not hasattr(processing_class, "pad"): + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False) + self.data_collator = data_collator + pass + pass + if packing: + print("Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!") + return dataset + + if max_seq_length == 0: + raise ValueError("When packing is enabled, `max_seq_length` can't be `None`.") + + if use_desc: map_kwargs["desc"] = f"Unsloth: Packing {dataset_name} dataset" + dataset = dataset.select_columns(used_column_names).map( + pack_examples, + batched = True, + fn_kwargs = {"seq_length": max_seq_length,}, + **map_kwargs, + ) + pass + return dataset + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs (usually, "input_ids" + # and "attention_mask"). When using `train_on_completion_only` we add a "completion_mask" column to the + # dataset. So we need to override the default signature columns to include "completion_mask" as well. + if self._signature_columns is None: + self._signature_columns = [ + "input_ids", + "labels", + "position_ids", + "completion_mask", + "assistant_masks", + ] + + def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None): + outputs = super().compute_loss( + model, + inputs, + return_outputs = return_outputs, + num_items_in_batch = num_items_in_batch, + ) + return outputs + + # Override training step to add activation offloading context. + def training_step(self, *args, **kwargs): + with self.maybe_activation_offload_context: + return super().training_step(*args, **kwargs) + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + mode = "train" if self.model.training else "eval" + metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} # average the metrics + + # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs` + # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format. + if mode == "eval": + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + super().log(logs, start_time) + self._metrics[mode].clear() + + # Ensure the model card is saved along with the checkpoint + def _save_checkpoint(self, model, trial): + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + self.create_model_card(model_name=model_name) + super()._save_checkpoint(model, trial) + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
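+
+ Example (illustrative only; `trainer` stands for an already-initialized trainer instance
+ and the argument values are placeholders):
+
+ ```python
+ trainer.create_model_card(model_name="my-model", dataset_name="my-dataset", tags=["example"])
+ ```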
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=list(tags), + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="SFT", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothSFTTrainer(_UnslothSFTTrainer): + """ + + Trainer for Supervised Fine-Tuning (SFT) method. + + This class is a wrapper around the [`transformers.Trainer`] class and inherits all of its attributes and methods. + + Example: + + ```python + from datasets import load_dataset + from trl import SFTTrainer + + dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]") + + trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", train_dataset=dataset) + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. Can be either: + + - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a + path to a *directory* containing model weights saved using + [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded + using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in + `args.model_init_kwargs`. + - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. + args ([`SFTConfig`], *optional*, defaults to `None`): + Configuration for this trainer. If `None`, a default configuration is used. + data_collator (`DataCollator`, *optional*): + Function to use to form a batch from a list of elements of the processed `train_dataset` or `eval_dataset`. + Will default to a custom [`DataCollatorForLanguageModeling`]. + train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): + Dataset to use for training. SFT supports both [language modeling](#language-modeling) type and + [prompt-completion](#prompt-completion) type. The format of the samples can be either: + + - [Standard](dataset_formats#standard): Each sample contains plain text. + - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role + and content). + + The trainer also supports processed datasets (tokenized) as long as they contain an `input_ids` field. + eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): + Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. + processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`): + Processing class used to process the data. If `None`, the processing class is loaded from the model's name + with [`~transformers.AutoTokenizer.from_pretrained`]. + callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`): + List of callbacks to customize the training loop. 
Will add those to the list of default callbacks detailed + in [here](https://huggingface.co/docs/transformers/main_classes/callback). + + If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] + method. + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*, defaults to `None`): + A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in + `args`. Incompatible with the `optimizers` argument. + + Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before + initializing the Trainer. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*, defaults to `None`): + A function that preprocess the logits right before caching them at each evaluation step. Must take two + tensors, the logits and the labels, and return the logits once processed as desired. The modifications made + by this function will be reflected in the predictions received by `compute_metrics`. + + Note that the labels (second parameter) will be `None` if the dataset does not have them. + peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`): + PEFT configuration used to wrap the model. If `None`, the model is not wrapped. + formatting_func (`Optional[Callable]`): + Formatting function applied to the dataset before tokenization. Applying the formatting function explicitly + converts the dataset into a [language modeling](#language-modeling) type. + + """ + def __init__( + self, + model, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + compute_loss_func = None, + compute_metrics = None, + callbacks = None, + optimizer_cls_and_kwargs = None, + preprocess_logits_for_metrics = None, + peft_config = None, + formatting_func = None, + **kwargs + ): + if args is None: args = UnslothSFTConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. 
Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if 'max_length' not in locals() and not hasattr(args, 'max_length'): + pass + else: + if hasattr(args, 'max_seq_length') and args.max_seq_length is not None and args.max_seq_length > 0: + if hasattr(args, 'max_length'): + args.max_length = args.max_seq_length + max_length = args.max_length + else: + model_max_length = getattr(model, 'max_seq_length', None) + # print(model_max_length, 'mml1') + if model_max_length is None: model_max_length = getattr(model, 'max_length', None) + # print(model_max_length, 'mml2') + if model_max_length is not None: + args.max_length = model_max_length + max_length = args.max_length + elif hasattr(args, 'max_length') and args.max_length is not None: + max_length = args.max_length + # if we are here, then we 
are in a weird case where max_length is set but max_seq_length is not set + setattr(model, 'max_seq_length', max_length) + else: + print('Unsloth: We did not find `max_seq_length` or `max_length` in the model or args. We will set it to 1024.') + args.max_length = 1024 + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('sft_trainer', other_metrics) + IGNORED_TOKENIZER_NAMES = os.environ.get('UNSLOTH_IGNORED_TOKENIZER_NAMES', '').split('\n') + from unsloth_zoo.tokenizer_utils import fix_untrained_tokens + from unsloth_zoo.training_utils import fix_zero_training_loss + if 'tokenizer' not in locals(): tokenizer = processing_class + fix_untrained_tokens(model, tokenizer, train_dataset, IGNORED_TOKENIZER_NAMES, eps = 1e-16) + fix_zero_training_loss(model, tokenizer, train_dataset) + + super().__init__( + model = model, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + compute_loss_func = compute_loss_func, + compute_metrics = compute_metrics, + callbacks = callbacks, + optimizer_cls_and_kwargs = optimizer_cls_and_kwargs, + preprocess_logits_for_metrics = preprocess_logits_for_metrics, + peft_config = peft_config, + formatting_func = formatting_func,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git a/unsloth_compiled_cache/UnslothXPOTrainer.py b/unsloth_compiled_cache/UnslothXPOTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..8a00e952d056e2fdad3984f40c18d297c94a3056 --- /dev/null 
+++ b/unsloth_compiled_cache/UnslothXPOTrainer.py @@ -0,0 +1,1036 @@ +""" +2025.7.4 +2025.7.3 +4.53.2 +0.19.1 +__UNSLOTH_VERSIONING__ +""" +from torch import Tensor +import torch +import torch.nn as nn +from torch.nn import functional as F +from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable +from trl.trainer.xpo_trainer import (Any, BaseImageProcessor, BasePairwiseJudge, Callable, Dataset, EvalPrediction, F, FeatureExtractionMixin, IterableDataset, OnlineDPOTrainer, OptimizerNames, Optional, PeftModel, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, SIMPLE_CHAT_TEMPLATE, TrainerCallback, Union, XPOConfig, XPOTrainer, empty_cache, generate_model_card, get_comet_experiment_url, get_reward, is_conversational, is_peft_available, is_wandb_available, jinja2, maybe_apply_chat_template, nn, os, textwrap, torch, truncate_right, unwrap_model_for_generation, wandb) + + +import os +from typing import * +from dataclasses import dataclass, field +from packaging.version import Version +import torch +import numpy as np +from contextlib import nullcontext +from torch.nn import functional as F +from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling + +torch_compile_options = { + "epilogue_fusion" : True, + "max_autotune" : False, + "shape_padding" : True, + "trace.enabled" : False, + "triton.cudagraphs" : False, +} + +@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) +def selective_log_softmax(logits, index): + logits = logits.to(torch.float32) + selected_logits = torch.gather(logits, dim = -1, index = index.unsqueeze(-1)).squeeze(-1) + # loop to reduce peak mem consumption + # logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits]) + logsumexp_values = torch.logsumexp(logits, dim = -1) + per_token_logps = selected_logits - logsumexp_values # log_softmax(x_i) = x_i - logsumexp(x) + return per_token_logps +@dataclass +class UnslothXPOConfig(XPOConfig): + """ + + Configuration class for the [`XPOTrainer`]. + + Subclass of [`OnlineDPOConfig`] we can use all its arguments and add the following: + + Parameters: + alpha (`float` or `list[float]`, *optional*, defaults to `1e-5`): + Weight of the XPO loss term. If a list of floats is provided then the alpha is selected for each new epoch + and the last alpha is used for the rest of the epochs. + + """ + vllm_sampling_params: Optional[Any] = field( + default = None, + metadata = {'help': 'vLLM SamplingParams'}, + ) + unsloth_num_chunks : Optional[int] = field( + default = -1, + metadata = {'help': 'Chunk size to reduce memory usage. 
-1 is most efficient.'}, + ) + def __init__( + self, + output_dir = None, + overwrite_output_dir = None, + do_train = False, + do_eval = False, + do_predict = False, + eval_strategy = 'no', + prediction_loss_only = False, + per_device_train_batch_size = 4, + per_device_eval_batch_size = 4, + per_gpu_train_batch_size = None, + per_gpu_eval_batch_size = None, + gradient_accumulation_steps = 2, + eval_accumulation_steps = 2, + eval_delay = 0, + torch_empty_cache_steps = 250, + learning_rate = 5e-05, + weight_decay = 0.01, + adam_beta1 = 0.9, + adam_beta2 = 0.999, + adam_epsilon = 1e-08, + max_grad_norm = 1.0, + num_train_epochs = 3.0, + max_steps = -1, + lr_scheduler_type = 'linear', + warmup_ratio = 0.1, + warmup_steps = 0, + log_level = 'passive', + log_level_replica = 'warning', + log_on_each_node = True, + logging_dir = None, + logging_strategy = 'steps', + logging_first_step = False, + logging_steps = 1, + logging_nan_inf_filter = False, + save_strategy = 'steps', + save_steps = 500, + save_total_limit = None, + save_safetensors = True, + save_on_each_node = False, + save_only_model = False, + restore_callback_states_from_checkpoint = False, + no_cuda = False, + use_cpu = False, + use_mps_device = False, + seed = 3407, + data_seed = 3407, + jit_mode_eval = False, + use_ipex = False, + bf16 = False, + fp16 = False, + fp16_opt_level = 'O1', + half_precision_backend = 'auto', + bf16_full_eval = False, + fp16_full_eval = False, + tf32 = None, + local_rank = -1, + ddp_backend = None, + tpu_num_cores = None, + tpu_metrics_debug = False, + debug = '', + dataloader_drop_last = False, + eval_steps = None, + dataloader_num_workers = 0, + dataloader_prefetch_factor = None, + past_index = -1, + run_name = None, + disable_tqdm = None, + remove_unused_columns = True, + label_names = None, + load_best_model_at_end = False, + metric_for_best_model = None, + greater_is_better = None, + ignore_data_skip = False, + fsdp = '', + fsdp_min_num_params = 0, + fsdp_config = None, + fsdp_transformer_layer_cls_to_wrap = None, + accelerator_config = None, + deepspeed = None, + label_smoothing_factor = 0.0, + optim = 'adamw_8bit', + optim_args = None, + adafactor = False, + group_by_length = False, + length_column_name = 'length', + report_to = None, + ddp_find_unused_parameters = None, + ddp_bucket_cap_mb = None, + ddp_broadcast_buffers = None, + dataloader_pin_memory = True, + dataloader_persistent_workers = False, + skip_memory_metrics = True, + use_legacy_prediction_loop = False, + push_to_hub = False, + resume_from_checkpoint = None, + hub_model_id = None, + hub_strategy = 'every_save', + hub_token = None, + hub_private_repo = None, + hub_always_push = False, + hub_revision = None, + gradient_checkpointing = False, + gradient_checkpointing_kwargs = None, + include_inputs_for_metrics = False, + eval_do_concat_batches = True, + fp16_backend = 'auto', + push_to_hub_model_id = None, + push_to_hub_organization = None, + push_to_hub_token = None, + mp_parameters = '', + auto_find_batch_size = False, + full_determinism = False, + torchdynamo = None, + ray_scope = 'last', + ddp_timeout = 1800, + torch_compile = False, + torch_compile_backend = None, + torch_compile_mode = None, + include_tokens_per_second = False, + include_num_input_tokens_seen = False, + neftune_noise_alpha = None, + optim_target_modules = None, + batch_eval_metrics = False, + eval_on_start = False, + use_liger_kernel = False, + liger_kernel_config = None, + eval_use_gather_object = False, + average_tokens_across_devices = False, + reward_model_path 
= None, + judge = None, + max_new_tokens = 64, + max_length = 512, + temperature = 0.9, + missing_eos_penalty = None, + loss_type = 'sigmoid', + dataset_num_proc = None, + disable_dropout = True, + use_vllm = False, + gpu_memory_utilization = 0.55, + ds3_gather_for_generation = True, + vllm_sampling_params = None, + unsloth_num_chunks = -1, + **kwargs, + ): + if learning_rate < 1e-7: raise FloatingPointError(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') + if learning_rate > 1: raise OverflowError(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (> 1)! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') + if output_dir is None and save_strategy == 'steps' and save_steps == 500: + output_dir = 'unsloth_training_checkpoints' + save_strategy = 'no' + if dataset_num_proc is None: + from multiprocessing import cpu_count + dataset_num_proc = cpu_count() + if temperature <= 0: + raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') + elif temperature >= 10: + raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') + + + super().__init__( + output_dir = output_dir, + overwrite_output_dir = overwrite_output_dir, + do_train = do_train, + do_eval = do_eval, + do_predict = do_predict, + eval_strategy = eval_strategy, + prediction_loss_only = prediction_loss_only, + per_device_train_batch_size = per_device_train_batch_size, + per_device_eval_batch_size = per_device_eval_batch_size, + per_gpu_train_batch_size = per_gpu_train_batch_size, + per_gpu_eval_batch_size = per_gpu_eval_batch_size, + gradient_accumulation_steps = gradient_accumulation_steps, + eval_accumulation_steps = eval_accumulation_steps, + eval_delay = eval_delay, + torch_empty_cache_steps = torch_empty_cache_steps, + learning_rate = learning_rate, + weight_decay = weight_decay, + adam_beta1 = adam_beta1, + adam_beta2 = adam_beta2, + adam_epsilon = adam_epsilon, + max_grad_norm = max_grad_norm, + num_train_epochs = num_train_epochs, + max_steps = max_steps, + lr_scheduler_type = lr_scheduler_type, + warmup_ratio = warmup_ratio, + warmup_steps = warmup_steps, + log_level = log_level, + log_level_replica = log_level_replica, + log_on_each_node = log_on_each_node, + logging_dir = logging_dir, + logging_strategy = logging_strategy, + logging_first_step = logging_first_step, + logging_steps = logging_steps, + logging_nan_inf_filter = logging_nan_inf_filter, + save_strategy = save_strategy, + save_steps = save_steps, + save_total_limit = save_total_limit, + save_safetensors = save_safetensors, + save_on_each_node = save_on_each_node, + save_only_model = save_only_model, + restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, + no_cuda = no_cuda, + use_cpu = use_cpu, + use_mps_device = use_mps_device, + seed = seed, + data_seed = data_seed, + jit_mode_eval = jit_mode_eval, + use_ipex = use_ipex, + bf16 = bf16, + fp16 = fp16, + fp16_opt_level = fp16_opt_level, + half_precision_backend = half_precision_backend, + bf16_full_eval = bf16_full_eval, + fp16_full_eval = fp16_full_eval, + tf32 = tf32, + local_rank = local_rank, + ddp_backend = ddp_backend, + tpu_num_cores = tpu_num_cores, + tpu_metrics_debug = tpu_metrics_debug, + debug = debug, + dataloader_drop_last = dataloader_drop_last, + eval_steps = eval_steps, + dataloader_num_workers = 
dataloader_num_workers, + dataloader_prefetch_factor = dataloader_prefetch_factor, + past_index = past_index, + run_name = run_name, + disable_tqdm = disable_tqdm, + remove_unused_columns = remove_unused_columns, + label_names = label_names, + load_best_model_at_end = load_best_model_at_end, + metric_for_best_model = metric_for_best_model, + greater_is_better = greater_is_better, + ignore_data_skip = ignore_data_skip, + fsdp = fsdp, + fsdp_min_num_params = fsdp_min_num_params, + fsdp_config = fsdp_config, + fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, + accelerator_config = accelerator_config, + deepspeed = deepspeed, + label_smoothing_factor = label_smoothing_factor, + optim = optim, + optim_args = optim_args, + adafactor = adafactor, + group_by_length = group_by_length, + length_column_name = length_column_name, + report_to = report_to, + ddp_find_unused_parameters = ddp_find_unused_parameters, + ddp_bucket_cap_mb = ddp_bucket_cap_mb, + ddp_broadcast_buffers = ddp_broadcast_buffers, + dataloader_pin_memory = dataloader_pin_memory, + dataloader_persistent_workers = dataloader_persistent_workers, + skip_memory_metrics = skip_memory_metrics, + use_legacy_prediction_loop = use_legacy_prediction_loop, + push_to_hub = push_to_hub, + resume_from_checkpoint = resume_from_checkpoint, + hub_model_id = hub_model_id, + hub_strategy = hub_strategy, + hub_token = hub_token, + hub_private_repo = hub_private_repo, + hub_always_push = hub_always_push, + hub_revision = hub_revision, + gradient_checkpointing = gradient_checkpointing, + gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, + include_inputs_for_metrics = include_inputs_for_metrics, + eval_do_concat_batches = eval_do_concat_batches, + fp16_backend = fp16_backend, + push_to_hub_model_id = push_to_hub_model_id, + push_to_hub_organization = push_to_hub_organization, + push_to_hub_token = push_to_hub_token, + mp_parameters = mp_parameters, + auto_find_batch_size = auto_find_batch_size, + full_determinism = full_determinism, + torchdynamo = torchdynamo, + ray_scope = ray_scope, + ddp_timeout = ddp_timeout, + torch_compile = torch_compile, + torch_compile_backend = torch_compile_backend, + torch_compile_mode = torch_compile_mode, + include_tokens_per_second = include_tokens_per_second, + include_num_input_tokens_seen = include_num_input_tokens_seen, + neftune_noise_alpha = neftune_noise_alpha, + optim_target_modules = optim_target_modules, + batch_eval_metrics = batch_eval_metrics, + eval_on_start = eval_on_start, + use_liger_kernel = use_liger_kernel, + liger_kernel_config = liger_kernel_config, + eval_use_gather_object = eval_use_gather_object, + average_tokens_across_devices = average_tokens_across_devices, + reward_model_path = reward_model_path, + judge = judge, + max_new_tokens = max_new_tokens, + max_length = max_length, + temperature = temperature, + missing_eos_penalty = missing_eos_penalty, + loss_type = loss_type, + dataset_num_proc = dataset_num_proc, + disable_dropout = disable_dropout, + use_vllm = use_vllm, + gpu_memory_utilization = gpu_memory_utilization, + ds3_gather_for_generation = ds3_gather_for_generation,**kwargs) + self.vllm_sampling_params = vllm_sampling_params + self.unsloth_num_chunks = unsloth_num_chunks +pass + +class _UnslothXPOTrainer(OnlineDPOTrainer): + r"""""" + + _tag_names = ["trl", "xpo"] + + def __init__( + self, + model: Union[PreTrainedModel, nn.Module] = None, + ref_model: Union[PreTrainedModel, nn.Module] = None, + reward_model: Optional[nn.Module] = None, + judge: 
Optional[BasePairwiseJudge] = None, + args: Optional[XPOConfig] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[Dataset, IterableDataset]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + peft_config: Optional[dict] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ) -> None: + super().__init__( + model=model, + ref_model=ref_model, + judge=judge, + reward_model=reward_model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + reward_processing_class=processing_class, # for now, XPOTrainer can't use any reward model + peft_config=peft_config, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + self._alpha = self.args.alpha + + # Overwrite the stats dictionary to include XPO specific statistics + self.stats = { + # Remove "non_score_reward", "rlhf_reward", "scores" + # Add "loss/dpo", "loss/xpo" + "loss/dpo": [], + "loss/xpo": [], + "objective/kl": [], + "objective/entropy": [], + "rewards/chosen": [], + "rewards/rejected": [], + "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], + # Replace "contain_eos_token" by "model_contain_eos_token" and "ref_contain_eos_token" + "val/model_contain_eos_token": [], + "val/ref_contain_eos_token": [], + "alpha": [], + "beta": [], + } + if self.reward_model is not None: + # Replace "scores" by "model_scores" and "ref_scores" + self.stats["objective/model_scores"] = [] + self.stats["objective/ref_scores"] = [] + self.stats["objective/scores_margin"] = [] + + @property + def alpha(self): + if isinstance(self._alpha, list): + epoch = self.state.epoch + return self._alpha[epoch] if epoch < len(self._alpha) else self._alpha[-1] + else: + return self._alpha + + def _generate_completions(self, prompts, model): + with unwrap_model_for_generation(model, self.accelerator) as unwrapped_policy_model_for_gen: + model_output = unwrapped_policy_model_for_gen.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + actual_model_for_ref_generation: torch.nn.Module + if self.ref_model is None: + unwrapped_main_model_for_ref_logic = self.accelerator.unwrap_model(model) + + if is_peft_available() and isinstance(unwrapped_main_model_for_ref_logic, PeftModel): + actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic.get_base_model() + else: + actual_model_for_ref_generation = unwrapped_main_model_for_ref_logic + else: + actual_model_for_ref_generation = self.accelerator.unwrap_model(self.ref_model) + + with unwrap_model_for_generation(actual_model_for_ref_generation, self.accelerator) as final_ref_model_for_gen: + ref_output = final_ref_model_for_gen.generate( + input_ids=prompts["input_ids"], + attention_mask=prompts["attention_mask"], + generation_config=self.generation_config, + ) + + return model_output, ref_output + + def _process_completions(self, model_output, 
ref_output, prompts): + context_length = prompts["input_ids"].shape[1] + + # Process model completions + model_completion_ids = model_output[:, context_length:] + model_completion_ids, model_completion_mask = truncate_right( + model_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + model_data = { + "input_ids": torch.cat((prompts["input_ids"], model_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], model_completion_mask), dim=1), + "raw": prompts["raw"], + } + + # Process reference model completions + ref_completion_ids = ref_output[:, context_length:] + ref_completion_ids, ref_completion_mask = truncate_right( + ref_completion_ids, self.processing_class.eos_token_id, self.processing_class.pad_token_id + ) + ref_data = { + "input_ids": torch.cat((prompts["input_ids"], ref_completion_ids), dim=1), + "attention_mask": torch.cat((prompts["attention_mask"], ref_completion_mask), dim=1), + "raw": prompts["raw"], + } + + return model_data, ref_data + + def _compute_rewards(self, model_data, ref_data, context_length): + with torch.no_grad(): + _, model_scores, _ = get_reward( + self.reward_model, model_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + _, ref_scores, _ = get_reward( + self.reward_model, ref_data["input_ids"], self.processing_class.pad_token_id, context_length + ) + + # Apply EOS penalty if needed + if self.args.missing_eos_penalty is not None: + model_contain_eos = torch.any(model_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + ref_contain_eos = torch.any(ref_data["input_ids"] == self.processing_class.eos_token_id, dim=-1) + model_scores[~model_contain_eos] -= self.args.missing_eos_penalty + ref_scores[~ref_contain_eos] -= self.args.missing_eos_penalty + + return model_scores, ref_scores + + def _compute_judge(self, model_data, ref_data, context_length): + prompts = model_data["raw"] + model_data_completions = self.processing_class.batch_decode( + model_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + model_data_completions = [completion.strip() for completion in model_data_completions] + + ref_data_completions = self.processing_class.batch_decode( + ref_data["input_ids"][:, context_length:], skip_special_tokens=True + ) + ref_data_completions = [completion.strip() for completion in ref_data_completions] + + if is_conversational({"prompt": prompts[0]}): + model_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in model_data_completions + ] + environment = jinja2.Environment() + template = environment.from_string(SIMPLE_CHAT_TEMPLATE) + prompts = [template.render(messages=message) for message in prompts] + model_data_completions = [template.render(messages=completion) for completion in model_data_completions] + + ref_data_completions = [ + [{"role": "assistant", "content": completion}] for completion in ref_data_completions + ] + ref_data_completions = [template.render(messages=completion) for completion in ref_data_completions] + + ranks_of_first_completion = self.judge.judge( + prompts, + list(zip(model_data_completions, ref_data_completions)), + ) + # convert ranks to a True/False mask: + # when rank == 0, it means the first completion is the best + # when rank == 1, it means the second completion is the best + return torch.tensor([rank == 0 for rank in ranks_of_first_completion], device=model_data["input_ids"].device) + + def _compute_logprobs(self, model, model_data, ref_data, context_length): + def 
compute_logprobs_for_data(m, data): + output = m(data["input_ids"], attention_mask=data["attention_mask"]) + logits = output.logits[:, context_length - 1 : -1] + token_logprobs = selective_log_softmax(logits, data["input_ids"][:, context_length:]) + return token_logprobs + + # Compute logprobs for model completions + model_logprobs_model_data = compute_logprobs_for_data(model, model_data) + # Compute logprobs for model on reference completions (for XPO loss) + model_logprobs_ref_data = compute_logprobs_for_data(model, ref_data) + + # Compute logprobs for reference model completions + with torch.no_grad(): + if self.ref_model is None: + with model.disable_adapter(): + ref_logprobs_model_data = compute_logprobs_for_data(model, model_data) + ref_logprobs_ref_data = compute_logprobs_for_data(model, ref_data) + else: + ref_logprobs_model_data = compute_logprobs_for_data(self.ref_model, model_data) + ref_logprobs_ref_data = compute_logprobs_for_data(self.ref_model, ref_data) + + # Mask padding tokens + model_padding_mask = model_data["attention_mask"][:, context_length:] == 0 + ref_padding_mask = ref_data["attention_mask"][:, context_length:] == 0 + model_logprobs_model_data = model_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + model_logprobs_ref_data = model_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0) + ref_logprobs_ref_data = ref_logprobs_ref_data.masked_fill(ref_padding_mask, 0.0) + ref_logprobs_model_data = ref_logprobs_model_data.masked_fill(model_padding_mask, 0.0) + + return model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data + + def _compute_losses( + self, + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + ): + # Compute log probs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1) + ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + + chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs + + rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs + + # Compute logits as the difference between chosen and rejected log ratios + logits = chosen_log_ratios - rejected_log_ratios + + if self.args.loss_type == "sigmoid": + dpo_losses = -F.logsigmoid(self.beta * logits) + elif self.args.loss_type == "ipo": + dpo_losses = (logits - 1 / (2 * self.beta)) ** 2 + else: + raise NotImplementedError(f"invalid loss type {self.args.loss_type}") + + # Compute XPO specific loss + xpo_losses = self.alpha * model_logprobs_ref_data_sum + + # Total loss + loss = (dpo_losses + xpo_losses).mean() + + return loss, dpo_losses, xpo_losses + + def _log_statistics( + self, + model_data, + ref_data, + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + dpo_losses, + xpo_losses, + context_length, + model_scores=None, + ref_scores=None, + ): + # Helper function to gather and compute mean + def 
gather_mean(tensor): + return self.accelerator.gather_for_metrics(tensor).mean().item() + + # Log losses + self.stats["loss/dpo"].append(gather_mean(dpo_losses)) + self.stats["loss/xpo"].append(gather_mean(xpo_losses)) + + # Log scores + if self.reward_model is not None: + self.stats["objective/model_scores"].append(gather_mean(model_scores)) + self.stats["objective/ref_scores"].append(gather_mean(ref_scores)) + self.stats["objective/scores_margin"].append(gather_mean(model_scores - ref_scores)) + + # Log logprobs + model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) + model_logprobs_ref_data_sum = model_logprobs_ref_data.sum(1) + ref_logprobs_ref_data_sum = ref_logprobs_ref_data.sum(1) + ref_logprobs_model_data_sum = ref_logprobs_model_data.sum(1) + + chosen_model_logprobs = torch.where(chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + chosen_ref_logprobs = torch.where(chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + chosen_log_ratios = chosen_model_logprobs - chosen_ref_logprobs + + rejected_model_logprobs = torch.where(~chosen_mask, model_logprobs_model_data_sum, model_logprobs_ref_data_sum) + rejected_ref_logprobs = torch.where(~chosen_mask, ref_logprobs_model_data_sum, ref_logprobs_ref_data_sum) + rejected_log_ratios = rejected_model_logprobs - rejected_ref_logprobs + + self.stats["logps/chosen"].append(gather_mean(chosen_model_logprobs.mean() + chosen_ref_logprobs.mean())) + self.stats["logps/rejected"].append(gather_mean(rejected_model_logprobs.mean() + rejected_ref_logprobs.mean())) + + # Log rewards + # Compute various statistics + chosen_rewards = chosen_log_ratios * self.beta + rejected_rewards = rejected_log_ratios * self.beta + self.stats["rewards/chosen"].append(gather_mean(chosen_rewards.mean())) + self.stats["rewards/rejected"].append(gather_mean(rejected_rewards.mean())) + + # Calculate KL divergence for model and ref data + kl_model_data = model_logprobs_model_data - ref_logprobs_model_data + kl_ref_data = model_logprobs_ref_data - ref_logprobs_ref_data + mean_kl = (kl_model_data.sum(1) + kl_ref_data.sum(1)).mean() / 2 + self.stats["objective/kl"].append(gather_mean(mean_kl)) + + # Calculate entropy for model and ref data + entropy_model_data = -model_logprobs_model_data.sum(1) + entropy_ref_data = -model_logprobs_ref_data.sum(1) + mean_entropy = (entropy_model_data.mean() + entropy_ref_data.mean()) / 2 + self.stats["objective/entropy"].append(gather_mean(mean_entropy)) + + # Calculate margins + margin = chosen_rewards - rejected_rewards + self.stats["rewards/margins"].append(gather_mean(margin.mean())) + + # Calculate accuracy + accuracy = (margin > 0).float() + self.stats["rewards/accuracies"].append(gather_mean(accuracy.mean())) + + # Log EOS token statistics + model_eos = (model_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + ref_eos = (ref_data["input_ids"][:, context_length:] == self.processing_class.eos_token_id).any(dim=1) + self.stats["val/model_contain_eos_token"].append(gather_mean(model_eos.float())) + self.stats["val/ref_contain_eos_token"].append(gather_mean(ref_eos.float())) + + # Log alpha and beta + self.stats["alpha"].append(self.alpha) + self.stats["beta"].append(self.beta) + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None + ) -> torch.Tensor: + model.train() + + # Apply chat template and tokenize the input + batch_size = len(next(iter(inputs.values()))) + prompts = 
inputs["prompt"] + inputs = [{k: v[i] for k, v in inputs.items()} for i in range(batch_size)] + inputs = [maybe_apply_chat_template(x, self.processing_class) for x in inputs] + inputs = [self.tokenize_row(x, self.model.config.is_encoder_decoder, self.processing_class) for x in inputs] + inputs = self.data_collator(inputs) + + # need the prompt_ only + inputs = self._prepare_inputs(inputs) + context_length = inputs["prompt_input_ids"].shape[1] + prompts = { + "input_ids": inputs["prompt_input_ids"], + "attention_mask": inputs["prompt_attention_mask"], + "raw": prompts, + } + del inputs + + # Sample completions from both the model and the reference model + model_output, ref_output = self._generate_completions(prompts, model) + + # Process model completions + model_data, ref_data = self._process_completions(model_output, ref_output, prompts) + + # Compute rewards + if self.reward_model is not None: + model_scores, ref_scores = self._compute_rewards(model_data, ref_data, context_length) + chosen_mask = model_scores >= ref_scores + else: + model_scores, ref_scores = None, None + chosen_mask = self._compute_judge(model_data, ref_data, context_length) + + # Compute logprobs + model_logprobs_model_data, model_logprobs_ref_data, ref_logprobs_ref_data, ref_logprobs_model_data = ( + self._compute_logprobs(model, model_data, ref_data, context_length) + ) + + # Compute loss + loss, dpo_losses, xpo_losses = self._compute_losses( + model_logprobs_model_data, + model_logprobs_ref_data, + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + ) + + # Log everything + self._log_statistics( + model_data, + ref_data, + model_logprobs_model_data.detach(), + model_logprobs_ref_data.detach(), + ref_logprobs_ref_data, + ref_logprobs_model_data, + chosen_mask, + dpo_losses.detach(), + xpo_losses.detach(), + context_length, + model_scores, + ref_scores, + ) + + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + empty_cache() + + kwargs = {} + # For LOMO optimizers you need to explicitly use the learning rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss, **kwargs) + + return loss.detach() / self.args.gradient_accumulation_steps + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. 
+ """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + # normalize `tags` to a mutable set + if tags is None: + tags = set() + elif isinstance(tags, str): + tags = {tags} + else: + tags = set(tags) + + if hasattr(self.model.config, "unsloth_version"): + tags.add("unsloth") + + tags.update(self._tag_names) + + citation = textwrap.dedent("""\ + @article{jung2024binary, + title = {{Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF}}, + author = {Tengyang Xie and Dylan J. Foster and Akshay Krishnamurthy and Corby Rosset and Ahmed Awadallah and Alexander Rakhlin}, + year = 2024, + eprint = {arXiv:2405.21046} + }""") + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="XPO", + trainer_citation=citation, + paper_title="Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF", + paper_id="2405.21046", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) +class UnslothXPOTrainer(_UnslothXPOTrainer): + """ + + Initialize XPOTrainer as a subclass of [`OnlineDPOConfig`]. + + Args: + model (`transformers.PreTrainedModel`): + The model to train, preferably an `AutoModelForCausalLM`. + ref_model (`PreTrainedModelWrapper`): + Hugging Face transformer model with a casual language modelling head. Used for implicit reward computation + and loss. If no reference model is provided, the trainer will create a reference model with the same + architecture as the model to be optimized. + reward_model (`transformers.PreTrainedModel`): + The reward model to score completions with, preferably an `AutoModelForSequenceClassification`. + judge (`BasePairwiseJudge`): + The judge to use for pairwise comparison of model completions. + args (`XPOConfig`): + The XPO config arguments to use for training. + data_collator (`transformers.DataCollator`): + The data collator to use for training. If None is specified, the default data collator + (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the + sequences in the batch, given a dataset of paired sequences. + train_dataset (`datasets.Dataset`): + The dataset to use for training. + eval_dataset (`datasets.Dataset`): + The dataset to use for evaluation. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + peft_config (`dict`): + The peft config to use for training. + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): + The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to + metric values. + callbacks (`list[transformers.TrainerCallback]`): + The callbacks to use for training. 
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): + The optimizer and scheduler to use for training. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): + The function to use to preprocess the logits before computing the metrics. + + """ + def __init__( + self, + model = None, + ref_model = None, + reward_model = None, + judge = None, + args = None, + data_collator = None, + train_dataset = None, + eval_dataset = None, + processing_class = None, + peft_config = None, + compute_metrics = None, + callbacks = None, + preprocess_logits_for_metrics = None, + **kwargs + ): + if args is None: args = UnslothXPOConfig() + use_bf16 = getattr(args, 'bf16', False) + if type(use_bf16) is not bool: use_bf16 = False + use_fp16 = getattr(args, 'fp16', False) + if type(use_fp16) is not bool: use_fp16 = False + force_float32 = False + if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': + print('Unsloth: Switching to float32 training since model cannot work with float16') + force_float32 = True + mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') + dtype = getattr(model.config, 'torch_dtype', None) + if dtype is None: dtype = model.get_input_embeddings().dtype + from unsloth_zoo.utils import _get_dtype + dtype = _get_dtype(dtype) + float16 = dtype == torch.float16 + if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') + if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') + if force_float32: + args.fp16 = False + args.bf16 = False + os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' + elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': + args.fp16 = float16 + args.bf16 = not float16 + os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' + if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': + args.eval_strategy = 'steps' + if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 + ga_steps = getattr(args, 'gradient_accumulation_steps', None) + if ga_steps is not None and ga_steps > 1: + from transformers import __version__ as transformers_version + if Version(transformers_version) <= Version('4.45.2'): + print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' + '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') + if getattr(args, 'eval_strategy', 'no') != 'no': + eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) + if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size + if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps + fp16_full_eval = getattr(args, 'fp16_full_eval', False) + if type(fp16_full_eval) is not bool: fp16_full_eval = False + bf16_full_eval = getattr(args, 'bf16_full_eval', False) + if type(bf16_full_eval) is not bool: bf16_full_eval = False + if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True + if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False + if force_float32: + args.bf16_full_eval = 
False + args.fp16_full_eval = False + elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': + args.bf16_full_eval = True + args.fp16_full_eval = False + elif not bf16_full_eval and not fp16_full_eval: + args.bf16_full_eval = args.bf16 + args.fp16_full_eval = args.fp16 + _output_logits = False + if locals().get('compute_metrics', None) is not None: _output_logits = True + if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True + if _output_logits: + os.environ['UNSLOTH_RETURN_LOGITS'] = '1' + if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): + pass + else: + model_max_seq_length = getattr(model, 'max_seq_length', None) + args_max_seq_length = getattr(args, 'max_seq_length', None) + if args_max_seq_length is None and model_max_seq_length is not None: + max_seq_length = model.max_seq_length + if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length + if model is not None and hasattr(model, 'for_training'): + model.for_training() + if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' + if 'processing_class' in locals(): + if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' + if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' + __tokenizer = processing_class if 'processing_class' in locals() else tokenizer + from unsloth_zoo.vision_utils import UnslothVisionDataCollator + if not isinstance(data_collator, UnslothVisionDataCollator): + if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer, mlm = False, mlm_probability = 0.0) + elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: + data_collator = DataCollatorForSeq2Seq(__tokenizer) + else: + if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False + if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' + if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} + if not isinstance(data_collator, UnslothVisionDataCollator): + if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): + if isinstance(data_collator, DataCollatorForSeq2Seq): + data_collator = DataCollatorForSeq2Seq(__tokenizer.tokenizer) + else: + data_collator = TransformersDataCollatorForLanguageModeling(__tokenizer.tokenizer, mlm = False, mlm_probability = 0.0) + other_metrics = [] + + from unsloth_zoo.logging_utils import PatchRLStatistics + PatchRLStatistics('xpo_trainer', other_metrics) + + super().__init__( + model = model, + ref_model = ref_model, + reward_model = reward_model, + judge = judge, + args = args, + data_collator = data_collator, + train_dataset = train_dataset, + eval_dataset = eval_dataset, + processing_class = processing_class, + peft_config = peft_config, + compute_metrics = compute_metrics, + callbacks = callbacks, + preprocess_logits_for_metrics = preprocess_logits_for_metrics,**kwargs) + if hasattr(self, 'neftune_hook_handle'): + self.neftune_hook_handle.remove() + if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle + if getattr(args, 'neftune_noise_alpha', None) is not None: + model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha + pass + +pass diff --git 
a/unsloth_compiled_cache/__pycache__/UnslothAlignPropTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothAlignPropTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea62992f796cc9987596e7e22574399e18f437f8 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothAlignPropTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3a09b920475a2c7fbc9ef81ba138e9087f9b1ab Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e844707aa5565af47759d48d682dd026b9157a1a Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothDDPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothDDPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8756eb6be51b87dd7eb122395b79ab104cdd81ec Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothDDPOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7ee712c1beaabb33e8effcc494bea8dd913262f --- /dev/null +++ b/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b7a255c186fdba9be966ebdda959302b60acff0b8bedced84d923c153d1d910 +size 119151 diff --git a/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca6f9f95b99d26d2863d9ed66d52936591632296 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e18582dab661a09ac7ccef799ed74b340cf9733 --- /dev/null +++ b/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:148636c05fa9a32fea988fb4f89360232556f0ee3badb8040ce6d018bbcc02ad +size 129080 diff --git a/unsloth_compiled_cache/__pycache__/UnslothIterativeSFTTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothIterativeSFTTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1956929037f7de92997e668aabd11a3b5fbba46a Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothIterativeSFTTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb1873b6512e28d0f1f7242b9cfdbf5df8c3f8c1 Binary files /dev/null and 
b/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b077a286e844f9facd11ecfefcdd935de0a0cc56 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6137c6b84599f1c309be3a9d2590192c3361b40 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a495d6a2a2748d5a2edfdfeb96a79ece2c03a91 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d99105928a9e24651b32bb0a3820dbd6b058743 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8edb013a838bfbf53aad78730a0d684a2fe9e88 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89fb43595f4a2dd2a4dc658a4f642ab15624bbbc Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c29f652d3f1ef5737c1899236469e490576efd8f Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20c54ffa755cf46adb765b220b76d9128d6c217b Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-311.pyc differ diff --git a/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-311.pyc b/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..442423df195161adbcb33973d7c7fb537901a511 Binary files /dev/null and b/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-311.pyc differ
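
Note: the UnslothXPOConfig and UnslothXPOTrainer classes added by this diff are normally generated and wired in automatically when trl's XPOTrainer is used after importing unsloth. The sketch below is a minimal, illustrative usage example and is not part of the diff itself: the model name, dataset, judge, and the import path of the generated module are placeholder assumptions chosen for illustration.

# Minimal usage sketch (assumptions: the generated module is importable as shown,
# and the placeholder model/dataset/judge fit your setup; in a normal run
# `import unsloth` patches trl's XPOTrainer/XPOConfig to these classes instead).
from unsloth import FastLanguageModel            # import unsloth before trl/transformers
from datasets import load_dataset
from trl import PairRMJudge                      # pairwise judge; requires `pip install llm-blender`
from unsloth_compiled_cache.UnslothXPOTrainer import UnslothXPOConfig, UnslothXPOTrainer  # assumed path

# Load a 4-bit base model with Unsloth and attach LoRA adapters (placeholder model name).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-0.5B-Instruct",
    max_seq_length = 1024,
    load_in_4bit = True,
)
model = FastLanguageModel.get_peft_model(model, r = 16, lora_alpha = 16)

# XPO trains on prompt-only data: completions are sampled online from the policy and the
# reference model, then ranked by the judge (or scored by a reward model).
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split = "train")

args = UnslothXPOConfig(
    output_dir = "xpo_outputs",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    max_new_tokens = 64,
    temperature = 0.9,
    logging_steps = 1,
)

trainer = UnslothXPOTrainer(
    model = model,
    judge = PairRMJudge(),        # alternatively pass reward_model = <AutoModelForSequenceClassification>
    args = args,
    processing_class = tokenizer,
    train_dataset = train_dataset,
)
trainer.train()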